1   package eu.fbk.dkm.pikes.resources.meantime;
2   
3   import eu.fbk.rdfpro.util.IO;
4   import eu.fbk.utils.core.CommandLine;
5   import ixa.kaflib.KAFDocument;
6   import org.slf4j.LoggerFactory;
7   import org.w3c.dom.Document;
8   import org.w3c.dom.Node;
9   import org.w3c.dom.NodeList;
10  import org.xml.sax.EntityResolver;
11  import org.xml.sax.InputSource;
12  import org.xml.sax.SAXException;
13  
14  import javax.xml.parsers.DocumentBuilder;
15  import javax.xml.parsers.DocumentBuilderFactory;
16  import java.io.File;
17  import java.io.IOException;
18  import java.io.Reader;
19  import java.io.StringReader;
20  import java.text.SimpleDateFormat;
21  import java.util.Date;
22  
23  /**
24   * Created by marcorospocher on 12/05/16.
25   */
26  public class ConvertDocsFromCatToken {
27  
28      private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
29  
30      private static final String DEFAULT_URL = "http://pikes.fbk.eu/conll/";
31  
32      private static final EntityResolver NULL_RESOLVER = new EntityResolver() {
33          public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
34              return new InputSource(new StringReader(""));
35          }
36      };
37      
38      public static void main(String[] args) throws Exception {
39  
40  
41          final CommandLine cmd = CommandLine
42                  .parser()
43                  .withName("ConvertDocsFromCatToken")
44                  .withHeader("ConvertDocsFromCatToken")
45                  .withOption("i", "input-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
46                  .withOption("o", "output-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
47                  .withOption("s", "sentences", "limit to 5 sentences")
48  //                .withOption("o", "output", "Output file", "FILE", CommandLine.Type.FILE, true,
49  //                        false, true)
50                  .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
51  
52          File inputFolder = cmd.getOptionValue("input-folder", File.class);
53          File outputFolder = cmd.getOptionValue("output-folder", File.class);
54          boolean sentence = cmd.hasOption("s");
55  
56          for (final File file : com.google.common.io.Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
57              if (!file.isFile()) {
58                  continue;
59              }
60              if (file.getName().startsWith(".")) {
61                  continue;
62              }
63  
64              if (!file.getName().endsWith(".xml")) {
65                  continue;
66              }
67  
68  
69  
70              File outputFile = new File(file.getAbsoluteFile().toString().replace(inputFolder.getAbsolutePath(),outputFolder.getAbsolutePath()).replace(".xml",".naf"));
71  
72              if (!outputFile.exists()) {
73  
74  
75                  try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(file.getAbsoluteFile().toString())))) {
76                      try {
77  
78                          //System.out.print(" WORKING");
79  
80  
81  //                        List<String> content = FileUtils.readLines(file, "utf-8");
82  //                        String header = content.get(0);
83  //                        List<String> token = content.stream().filter(line -> line.startsWith("<token")).collect(Collectors.toList());
84  //                        System.out.println("CIAO");
85  
86                          String rawText = "";
87  
88                          DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
89  
90                          dbf.setValidating(false);
91                          dbf.setIgnoringComments(false);
92                          dbf.setIgnoringElementContentWhitespace(true);
93                          dbf.setNamespaceAware(true);
94                          // dbf.setCoalescing(true);
95                          // dbf.setExpandEntityReferences(true);
96  
97                          DocumentBuilder db = null;
98                          db = dbf.newDocumentBuilder();
99                          db.setEntityResolver(NULL_RESOLVER);
100 
101                         // db.setErrorHandler( new MyErrorHandler());
102                         InputSource ips = new InputSource(reader);
103                         //return db.parse(ips);
104 
105                         Document catDoc = db.parse(ips);
106 
107                         Integer prevSentenceNum = 0;
108                         NodeList tokens = catDoc.getElementsByTagName("token");
109                         for(int k=0;k<tokens.getLength();k++){
110                             Node token = ((Node)tokens.item(k));
111                             String tk = token.getTextContent();
112 //                            System.out.println(token.getNodeName()+" : "+token.getTextContent());
113                             Integer sentenceNum = Integer.parseInt(token.getAttributes().getNamedItem("sentence").getTextContent());
114 //                            System.out.println(tk+"   "+sentenceNum);
115                             if (sentence)
116                                 if (sentenceNum > 5)
117                                     break;
118                             if (sentenceNum != prevSentenceNum) {
119                                 rawText = rawText + "\n";
120                                 prevSentenceNum = sentenceNum;
121                             }
122                             rawText=rawText+" "+tk;
123                         }
124                         System.out.println(rawText);
125 
126 
127                         if (!rawText.isEmpty()) {
128 
129                             outputFile.getParentFile().mkdirs();
130                             KAFDocument document = new KAFDocument("en", "v3");
131 
132                             document.save(outputFile.getAbsolutePath());
133 
134                             document.setRawText(rawText);
135 
136 
137 
138 
139                             KAFDocument.FileDesc fileDesc = document.createFileDesc();
140                             fileDesc.title = catDoc.getDocumentElement().getAttribute("doc_name");
141 
142                             Date thisDate = new Date();
143 
144                             fileDesc.creationtime = sdf.format(thisDate);
145                             String URL_str = catDoc.getDocumentElement().getAttribute("url");
146                             fileDesc.filename = catDoc.getDocumentElement().getAttribute("doc_name");
147 
148                             String urlTemplate = DEFAULT_URL;
149                             if (cmd.hasOption("url-template")) {
150                                 urlTemplate = cmd.getOptionValue("url-template", String.class);
151                             }
152 
153                             KAFDocument.Public aPublic = document.createPublic();
154                             //aPublic.uri = URL_str;
155                             aPublic.uri = URL_str;
156                             aPublic.publicId = catDoc.getDocumentElement().getAttribute("doc_id");
157 
158                             document.save(outputFile.getAbsolutePath());
159                         }
160 
161 
162 
163                     } catch (Exception e) {
164 
165                     }
166 
167                 }
168             } //else System.out.println(" SKIPPED");
169 
170 
171 
172 //
173 //
174 //
175 //
176 //                if (!text.isEmpty()) {
177 //                    File outputFile = new File(outputfile.getAbsoluteFile().toString() + "/" + StringUtils.leftPad(ID.toString(),4,"0") + ".naf");
178 //
179 //                    //File outputFile = new File(outputFileName);
180 //                    outputFile.getParentFile().mkdirs();
181 //                    KAFDocument document = new KAFDocument("en", "v3");
182 //
183 //                    document.save(outputFile.getAbsolutePath());
184 //
185 //                    document.setRawText(text);
186 //
187 //                    KAFDocument.FileDesc fileDesc = document.createFileDesc();
188 //                    fileDesc.title = ID.toString();
189 //
190 //                    Date thisDate = new Date();
191 //
192 //                    fileDesc.creationtime = sdf.format(thisDate);
193 //                    String URL_str = ID.toString();
194 //                    fileDesc.filename = URL_str;
195 //
196 //                    String urlTemplate = DEFAULT_URL;
197 //                    if (cmd.hasOption("url-template")) {
198 //                        urlTemplate = cmd.getOptionValue("url-template", String.class);
199 //                    }
200 //
201 //                    KAFDocument.Public aPublic = document.createPublic();
202 //                    //aPublic.uri = URL_str;
203 //                    aPublic.uri = urlTemplate + ID.toString();
204 //                    aPublic.publicId = ID.toString();
205 //
206 //                    document.save(outputFile.getAbsolutePath());
207 //                    text="";
208 //                    ID++;
209 //                }
210 //
211 //            } else if (line.isEmpty()) text+="\n";
212 //            else {
213 //                String[] conll_item = line.split(" ");
214 //                text+=conll_item[0]+" ";
215 //            }
216 
217         }
218     }
219 
220 }