1 package eu.fbk.dkm.pikes.resources.wt10g;
2
3 import eu.fbk.utils.core.CommandLine;
4 import ixa.kaflib.KAFDocument;
5 import org.apache.commons.lang.time.DateUtils;
6 import org.apache.tika.exception.TikaException;
7 import org.apache.tika.metadata.Metadata;
8 import org.apache.tika.parser.ParseContext;
9 import org.apache.tika.parser.html.HtmlParser;
10 import org.apache.tika.sax.BodyContentHandler;
11 import org.htmlcleaner.CleanerProperties;
12 import org.htmlcleaner.HtmlCleaner;
13 import org.htmlcleaner.SimpleHtmlSerializer;
14 import org.htmlcleaner.TagNode;
15 import org.jsoup.Jsoup;
16 import org.jsoup.safety.Whitelist;
17 import org.slf4j.Logger;
18 import org.slf4j.LoggerFactory;
19 import org.unbescape.html.HtmlEscape;
20 import org.xml.sax.SAXException;
21
22 import javax.xml.parsers.DocumentBuilder;
23 import javax.xml.parsers.DocumentBuilderFactory;
24 import java.io.*;
25 import java.nio.charset.Charset;
26 import java.nio.charset.StandardCharsets;
27 import java.text.ParseException;
28 import java.text.SimpleDateFormat;
29 import java.util.Date;
30
31
32
33
34 public class ConvertDocs {
35
36
37 private static final Logger LOGGER = LoggerFactory.getLogger(ConvertDocs.class);
38 private static String DEFAULT_URL = "http://pikes.fbk.eu/ke4ir/wt10g/docs/";
39 private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
40
41
42
43 private static String[] parsePatterns = {
44 "E, dd-MMM-yy HH:mm:ss z",
45 "E, dd MMM yy HH:mm:ss z",
46 "E, dd MMM yyyy HH:mm:ss z",
47 "E MMM dd HH:mm:ss yyyy",
48 "E, dd-MMM-yy HH:mm:ss yyyy",
49 ", dd MMM yyyy HH:mm:ss z",
50 "MMM dd, yyyy",
51 "E, dd-MMM-yy z",
52 "E, dd MMM yy z",
53
54
55 "E, dd-MMM-yy HH z"
56
57
58
59 };
60
61 public static void main(String[] args) {
62
63 try {
64
65
66
67
68 final CommandLine cmd = CommandLine
69 .parser()
70 .withName("wt10g-mysql-extractor")
71 .withHeader("Extract documents from wt10g mysql dump and save them in NAF format")
72 .withOption("i", "input", "Input file", "FILE", CommandLine.Type.FILE, true,
73 false, true)
74 .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
75 .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
76 CommandLine.Type.STRING, true, false, false)
77 .withLogger(LoggerFactory.getLogger("eu.fbk"))
78 .parse(args);
79
80 File inputfile = cmd.getOptionValue("input", File.class);
81 File outputFolder = cmd.getOptionValue("output", File.class);
82
83 String urlTemplate = DEFAULT_URL;
84 if (cmd.hasOption("url-template")) {
85 urlTemplate = cmd.getOptionValue("url-template", String.class);
86 }
87
88 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
89 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
90
91 LOGGER.info(inputfile.getName());
92
93
94
95
96 String document="";
97 try(BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile), Charset.forName("Windows-1252")))) {
98 for(String line; (line = br.readLine()) != null; ) {
99 if (line.contains("INSERT INTO trecdocument VALUES (")) {
100
101 document=line.replace("INSERT INTO trecdocument VALUES (","");
102 if (document.trim().endsWith("');")) {
103
104 document=document.replace(");","");
105 processWT10GDocument(document, outputFolder, urlTemplate);
106 document="";
107 } else document=document+"\n";
108 } else if (line.trim().startsWith("');")) {
109
110 line=line.replace(");","");
111 document=document+line;
112 processWT10GDocument(document, outputFolder, urlTemplate);
113 document="";
114 } else {
115
116 if (!line.trim().isEmpty()) document=document+line+"\n";
117 }
118 }
119
120 }
121
122 } catch (Exception e) {
123 CommandLine.fail(e);
124 }
125 }
126
127
128 private static void processWT10GDocument(String raw, File outputFolder, String urlTemplate) {
129
130
131
132 String[] parts = raw.split("', '");
133
134 String ID_str = parts[0].replaceFirst("'","").trim();
135 System.out.println("STARTING PROCESSING OF DOCUMENT :"+ID_str);
136 saveSingleSQL(raw,ID_str,outputFolder);
137
138 String URL_str = parts[1].trim();;
139 String DATE_str = parts[2].trim().replace("?","").replace(",",", ");
140
141 String TEXT_str = parts[3].substring(0,parts[3].lastIndexOf("'"));
142
143 if (!TEXT_str.trim().isEmpty()) {
144
145 Date thisDate = null;
146
147 try {
148 thisDate = DateUtils.parseDate(DATE_str, parsePatterns);
149
150 } catch (ParseException e) {
151
152 try {
153 thisDate = DateUtils.parseDate("Wednesday, 01-Jan-97 00:00:00 GMT", parsePatterns);
154 if (!DATE_str.contains("NULL") && !DATE_str.isEmpty())
155 System.out.println("MALFORMED DATE PROBLEM in " + ID_str + "!!! :" + DATE_str);
156 } catch (ParseException e1) {
157
158 thisDate = new Date();
159 }
160
161
162 }
163
164 String[] localName_parts = ID_str.split("-");
165 String localName = localName_parts[0] + File.separator + localName_parts[1] + File.separator + ID_str;
166 String outputFileName = outputFolder.getAbsolutePath() + File.separator + "naf-in" + File.separator + localName + ".naf";
167
168
169
170
171 String text_to_clean = TEXT_str.replace("''", "'");
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272 HtmlCleaner cleaner = new HtmlCleaner();
273
274
275 CleanerProperties props = cleaner.getProperties();
276
277
278 props.setAddNewlineToHeadAndBody(true);
279
280 final SimpleHtmlSerializer htmlSerializer =
281 new SimpleHtmlSerializer(props);
282
283
284
285
286
287 String cleaned_text = "";
288 try {
289 TagNode node = cleaner.clean(text_to_clean);
290
291 cleaned_text=cleaner.clean(htmlSerializer.getAsString(node)).getText().toString();
292
293
294
295
296
297
298
299
300
301
302
303 } catch (IllegalArgumentException e) {
304
305 System.out.println("POSSIBLE ILLEGAL CHARACTERS : " + ID_str);
306 text_to_clean = HtmlEscape.unescapeHtml(text_to_clean);
307 TagNode node = cleaner.clean(text_to_clean);
308
309 try {
310 cleaned_text = parseToPlainText(ID_str,htmlSerializer.getAsString(node));
311 } catch (IOException e1) {
312 e1.printStackTrace();
313 } catch (SAXException e1) {
314 e1.printStackTrace();
315 } catch (TikaException e1) {
316 e1.printStackTrace();
317 }
318 } catch (ClassCastException e) {
319
320 System.out.println("POSSIBLE CAST ISSUE - JSOUPED: " + ID_str);
321 text_to_clean = Jsoup.clean(text_to_clean, Whitelist.simpleText());
322 TagNode node = cleaner.clean(text_to_clean);
323
324 try {
325 cleaned_text = parseToPlainText(ID_str,htmlSerializer.getAsString(node));
326 } catch (IOException e1) {
327 e1.printStackTrace();
328 } catch (SAXException e1) {
329 e1.printStackTrace();
330 } catch (TikaException e1) {
331 e1.printStackTrace();
332 }
333 } catch (Throwable e) {
334
335
336
337 System.out.println("OTHERS PARSING ERRORS: " + ID_str);
338
339 text_to_clean = Jsoup.clean(text_to_clean, Whitelist.simpleText());
340 TagNode node = cleaner.clean(text_to_clean);
341
342 try {
343 cleaned_text = parseToPlainText(ID_str,htmlSerializer.getAsString(node));
344 } catch (IOException e1) {
345 e1.printStackTrace();
346 } catch (SAXException e1) {
347 e1.printStackTrace();
348 } catch (TikaException e1) {
349 e1.printStackTrace();
350 }
351
352 }
353
354
355 if (cleaned_text.equals(text_to_clean)) System.out.println("POSSIBLE PROBLEM IN CLEANING HTML: " + ID_str);
356 if (cleaned_text.contains("</") || cleaned_text.contains("/>"))
357 System.out.println("POSSIBLE HTML TAGS IN: " + ID_str);
358
359
360
361
362 File outputFile = new File(outputFileName);
363 outputFile.getParentFile().mkdirs();
364
365 KAFDocument document = new KAFDocument("en", "v3");
366
367 while (cleaned_text.contains("]]>"))
368 cleaned_text = cleaned_text.replace("]]>", "");
369
370
371 document.setRawText(cleaned_text);
372
373 KAFDocument.FileDesc fileDesc = document.createFileDesc();
374 fileDesc.title = ID_str;
375
376 fileDesc.creationtime = sdf.format(thisDate);
377 fileDesc.filename = URL_str;
378
379 KAFDocument.Public aPublic = document.createPublic();
380
381 aPublic.uri = urlTemplate + ID_str;
382 aPublic.publicId = ID_str;
383
384 document.save(outputFile.getAbsolutePath());
385
386
387
388 } else System.out.println("EMPTY FILE DISCARDED:" + ID_str);
389 System.out.println("PROCESSING OF DOCUMENT CONCLUDED:" + ID_str);
390 }
391
392
393 private static String parseToPlainText(String ID_str, String TEXT_str) throws IOException, SAXException, TikaException {
394 BodyContentHandler handler = new BodyContentHandler(-1);
395
396 HtmlParser parser = new HtmlParser();
397
398 Metadata metadata = new Metadata();
399 ParseContext pcontext = new ParseContext();
400 try (InputStream stream = new ByteArrayInputStream( TEXT_str.getBytes( ) )) {
401 parser.parse(stream, handler, metadata,pcontext);
402
403 String title = metadata.get("title");
404 if (title==null) title ="";
405 String content=handler.toString();
406
407 return title+"\n"+content;
408 }
409 }
410
411 private static void saveSingleSQL(String raw, String ID_str, File outputFolder ){
412
413 String[] localName_parts = ID_str.split("-");
414 String localName = localName_parts[0]+File.separator+localName_parts[1]+File.separator+ID_str;
415
416 BufferedWriter writer = null;
417 try {
418
419 File errorFile = new File(outputFolder.getAbsolutePath()+File.separator+"sql"+File.separator+localName+".sql");
420 errorFile.getParentFile().mkdirs();
421
422 writer = new BufferedWriter(new FileWriter(errorFile));
423 writer.write("INSERT INTO trecdocument VALUES ("+raw+");");
424 } catch (Exception e) {
425 e.printStackTrace();
426 } finally {
427 try {
428
429 writer.close();
430 } catch (Exception e) {
431 }
432 }
433
434 }
435
436 private static String postProcess(String data) {
437
438
439 String[] arr_data = data.split("\\n",-1);
440 String final_data="";
441
442 for(int i=0;i<arr_data.length;i++){
443 if (!arr_data[i].isEmpty()) final_data=final_data+arr_data[i]+" ";
444 else final_data=final_data+arr_data[i]+"\n";
445 }
446 return final_data;
447 }
448
449 }