1 package eu.fbk.dkm.pikes.resources.meantime;
2
3 import eu.fbk.rdfpro.util.IO;
4 import eu.fbk.utils.core.CommandLine;
5 import ixa.kaflib.KAFDocument;
6 import org.slf4j.LoggerFactory;
7 import org.w3c.dom.Document;
8 import org.w3c.dom.Node;
9 import org.w3c.dom.NodeList;
10 import org.xml.sax.EntityResolver;
11 import org.xml.sax.InputSource;
12 import org.xml.sax.SAXException;
13
14 import javax.xml.parsers.DocumentBuilder;
15 import javax.xml.parsers.DocumentBuilderFactory;
16 import java.io.File;
17 import java.io.IOException;
18 import java.io.Reader;
19 import java.io.StringReader;
20 import java.text.SimpleDateFormat;
21 import java.util.Date;
22
23
24
25
26 public class ConvertDocsFromCatToken {
27
28 private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
29
30 private static final String DEFAULT_URL = "http://pikes.fbk.eu/conll/";
31
32 private static final EntityResolver NULL_RESOLVER = new EntityResolver() {
33 public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
34 return new InputSource(new StringReader(""));
35 }
36 };
37
38 public static void main(String[] args) throws Exception {
39
40
41 final CommandLine cmd = CommandLine
42 .parser()
43 .withName("ConvertDocsFromCatToken")
44 .withHeader("ConvertDocsFromCatToken")
45 .withOption("i", "input-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
46 .withOption("o", "output-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
47 .withOption("s", "sentences", "limit to 5 sentences")
48
49
50 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
51
52 File inputFolder = cmd.getOptionValue("input-folder", File.class);
53 File outputFolder = cmd.getOptionValue("output-folder", File.class);
54 boolean sentence = cmd.hasOption("s");
55
56 for (final File file : com.google.common.io.Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
57 if (!file.isFile()) {
58 continue;
59 }
60 if (file.getName().startsWith(".")) {
61 continue;
62 }
63
64 if (!file.getName().endsWith(".xml")) {
65 continue;
66 }
67
68
69
70 File outputFile = new File(file.getAbsoluteFile().toString().replace(inputFolder.getAbsolutePath(),outputFolder.getAbsolutePath()).replace(".xml",".naf"));
71
72 if (!outputFile.exists()) {
73
74
75 try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(file.getAbsoluteFile().toString())))) {
76 try {
77
78
79
80
81
82
83
84
85
86 String rawText = "";
87
88 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
89
90 dbf.setValidating(false);
91 dbf.setIgnoringComments(false);
92 dbf.setIgnoringElementContentWhitespace(true);
93 dbf.setNamespaceAware(true);
94
95
96
97 DocumentBuilder db = null;
98 db = dbf.newDocumentBuilder();
99 db.setEntityResolver(NULL_RESOLVER);
100
101
102 InputSource ips = new InputSource(reader);
103
104
105 Document catDoc = db.parse(ips);
106
107 Integer prevSentenceNum = 0;
108 NodeList tokens = catDoc.getElementsByTagName("token");
109 for(int k=0;k<tokens.getLength();k++){
110 Node token = ((Node)tokens.item(k));
111 String tk = token.getTextContent();
112
113 Integer sentenceNum = Integer.parseInt(token.getAttributes().getNamedItem("sentence").getTextContent());
114
115 if (sentence)
116 if (sentenceNum > 5)
117 break;
118 if (sentenceNum != prevSentenceNum) {
119 rawText = rawText + "\n";
120 prevSentenceNum = sentenceNum;
121 }
122 rawText=rawText+" "+tk;
123 }
124 System.out.println(rawText);
125
126
127 if (!rawText.isEmpty()) {
128
129 outputFile.getParentFile().mkdirs();
130 KAFDocument document = new KAFDocument("en", "v3");
131
132 document.save(outputFile.getAbsolutePath());
133
134 document.setRawText(rawText);
135
136
137
138
139 KAFDocument.FileDesc fileDesc = document.createFileDesc();
140 fileDesc.title = catDoc.getDocumentElement().getAttribute("doc_name");
141
142 Date thisDate = new Date();
143
144 fileDesc.creationtime = sdf.format(thisDate);
145 String URL_str = catDoc.getDocumentElement().getAttribute("url");
146 fileDesc.filename = catDoc.getDocumentElement().getAttribute("doc_name");
147
148 String urlTemplate = DEFAULT_URL;
149 if (cmd.hasOption("url-template")) {
150 urlTemplate = cmd.getOptionValue("url-template", String.class);
151 }
152
153 KAFDocument.Public aPublic = document.createPublic();
154
155 aPublic.uri = URL_str;
156 aPublic.publicId = catDoc.getDocumentElement().getAttribute("doc_id");
157
158 document.save(outputFile.getAbsolutePath());
159 }
160
161
162
163 } catch (Exception e) {
164
165 }
166
167 }
168 }
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217 }
218 }
219
220 }