package eu.fbk.dkm.pikes.resources.wt10g;

import eu.fbk.utils.core.CommandLine;
import ixa.kaflib.KAFDocument;
import org.apache.commons.lang.time.DateUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.unbescape.html.HtmlEscape;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Converts documents from a WT10G MySQL dump into NAF files.
 * <p>
 * Created by marcorospocher on 12/05/16.
 */
public class ConvertDocs {

    private static final Logger LOGGER = LoggerFactory.getLogger(ConvertDocs.class);
    private static String DEFAULT_URL = "http://pikes.fbk.eu/ke4ir/wt10g/docs/";
    private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");

    // Date patterns observed in the dump, tried in order by DateUtils.parseDate().
    private static String[] parsePatterns = {
            "E, dd-MMM-yy HH:mm:ss z",      // Wednesday, 01-Jan-97 15:20:23 GMT
            "E, dd MMM yy HH:mm:ss z",      // Fri, 17 Jan 97 02:15:05 GMT
            "E, dd MMM yyyy HH:mm:ss z",    // Wed, 01 Jan 1997 15:21:07 GMT
            "E MMM dd HH:mm:ss yyyy",       // Wed Mar  6 13:36:27 1996
            "E, dd-MMM-yy HH:mm:ss yyyy",   // Friday, 21-Feb-97 15:29:07 1997
            ", dd MMM yyyy HH:mm:ss z",     // , 17 Feb 1997 14:15:0 GMT
            "MMM dd, yyyy",                 // January 23, 1997
            "E, dd-MMM-yy z",               // Friday, 07-Feb-97 ? GMT
            "E, dd MMM yy z",               // Mon, 10 Feb 1997 ? GMT
            "E, dd-MMM-yy HH z"             // Sunday, 12-Jan-97 00 GMT
    };

    public static void main(String[] args) {

        try {

            // ACHTUNG: pass "-Dwt10g" to Maven when building, so that the required
            // dependencies are included.
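            //
            // Example invocation (hypothetical paths; the option names and the default URL below
            // come from the command-line definition that follows):
            //   mvn clean package -Dwt10g
            //   java -cp <built-jar> eu.fbk.dkm.pikes.resources.wt10g.ConvertDocs \
            //       -i wt10g-dump.sql -o /path/to/output -u http://pikes.fbk.eu/ke4ir/wt10g/docs/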

            final CommandLine cmd = CommandLine
                    .parser()
                    .withName("wt10g-mysql-extractor")
                    .withHeader("Extract documents from the wt10g MySQL dump and save them in NAF format")
                    .withOption("i", "input", "Input file", "FILE", CommandLine.Type.FILE, true,
                            false, true)
                    .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true,
                            false, true)
                    .withOption("u", "url-template", "URL template (the document ID is appended)", "URL",
                            CommandLine.Type.STRING, true, false, false)
                    .withLogger(LoggerFactory.getLogger("eu.fbk"))
                    .parse(args);

            File inputFile = cmd.getOptionValue("input", File.class);
            File outputFolder = cmd.getOptionValue("output", File.class);

            String urlTemplate = DEFAULT_URL;
            if (cmd.hasOption("url-template")) {
                urlTemplate = cmd.getOptionValue("url-template", String.class);
            }

            // note: the XML DocumentBuilder is created here but currently unused
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();

            LOGGER.info(inputFile.getName());

            String document = "";
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(inputFile), Charset.forName("Windows-1252")))) {
                for (String line; (line = br.readLine()) != null; ) {
                    if (line.contains("INSERT INTO trecdocument VALUES (")) {
                        // start of a document
                        document = line.replace("INSERT INTO trecdocument VALUES (", "");
                        if (document.trim().endsWith("');")) {
                            // single-line SQL statement
                            document = document.replace(");", "");
                            processWT10GDocument(document, outputFolder, urlTemplate);
                            document = "";
                        } else {
                            document = document + "\n";
                        }
                    } else if (line.trim().startsWith("');")) {
                        // end of a multi-line document
                        line = line.replace(");", "");
                        document = document + line;
                        processWT10GDocument(document, outputFolder, urlTemplate);
                        document = "";
                    } else {
                        // middle of a multi-line document
                        if (!line.trim().isEmpty()) {
                            document = document + line + "\n";
                        }
                    }
                }
            }

        } catch (Exception e) {
            CommandLine.fail(e);
        }
    }

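    /**
     * Processes one record of the dump. Judging from how the record is split below, {@code raw}
     * is expected to contain the values of a single {@code INSERT INTO trecdocument} statement,
     * i.e. {@code 'ID', 'URL', 'DATE', 'TEXT'}: the raw SQL is saved under {@code sql/}, the HTML
     * text is cleaned, and the resulting document is written as a NAF file under {@code naf-in/}
     * in the output folder.
     */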
    private static void processWT10GDocument(String raw, File outputFolder, String urlTemplate) {

        String[] parts = raw.split("', '");

        String ID_str = parts[0].replaceFirst("'", "").trim();
        System.out.println("STARTING PROCESSING OF DOCUMENT: " + ID_str);
        saveSingleSQL(raw, ID_str, outputFolder);

        String URL_str = parts[1].trim();
        String DATE_str = parts[2].trim().replace("?", "").replace(",", ", ");
        String TEXT_str = parts[3].substring(0, parts[3].lastIndexOf("'"));

        if (!TEXT_str.trim().isEmpty()) {

            Date thisDate = null;

            try {
                thisDate = DateUtils.parseDate(DATE_str, parsePatterns);
            } catch (ParseException e) {
                // fall back to a fixed default date when none of the patterns matches
                try {
                    thisDate = DateUtils.parseDate("Wednesday, 01-Jan-97 00:00:00 GMT", parsePatterns);
                    if (!DATE_str.contains("NULL") && !DATE_str.isEmpty()) {
                        System.out.println("MALFORMED DATE PROBLEM in " + ID_str + ": " + DATE_str);
                    }
                } catch (ParseException e1) {
                    // should never happen: the fallback string matches one of the patterns above
                    thisDate = new Date();
                }
            }

            String[] localName_parts = ID_str.split("-");
            String localName = localName_parts[0] + File.separator + localName_parts[1] + File.separator + ID_str;
            String outputFileName = outputFolder.getAbsolutePath() + File.separator + "naf-in" + File.separator
                    + localName + ".naf";

            // start cleaning the content: the dump escapes single quotes as ''
            String text_to_clean = TEXT_str.replace("''", "'");

            // Clean the HTML with HtmlCleaner: parse the raw markup into a tree, re-serialize it,
            // then parse the serialized form again and take its plain text. A single cleaner
            // instance may safely be used multiple times.
            HtmlCleaner cleaner = new HtmlCleaner();

            // take the default cleaner properties and customize them
            CleanerProperties props = cleaner.getProperties();
            props.setAddNewlineToHeadAndBody(true);

            final SimpleHtmlSerializer htmlSerializer = new SimpleHtmlSerializer(props);

            String cleaned_text = "";
            try {
                TagNode node = cleaner.clean(text_to_clean);
                cleaned_text = cleaner.clean(htmlSerializer.getAsString(node)).getText().toString();
            } catch (IllegalArgumentException e) {
                System.out.println("POSSIBLE ILLEGAL CHARACTERS: " + ID_str);
                // unescape HTML entities and retry, using Tika for the plain-text extraction
                text_to_clean = HtmlEscape.unescapeHtml(text_to_clean);
                TagNode node = cleaner.clean(text_to_clean);
                try {
                    cleaned_text = parseToPlainText(ID_str, htmlSerializer.getAsString(node));
                } catch (IOException | SAXException | TikaException e1) {
                    e1.printStackTrace();
                }
            } catch (ClassCastException e) {
                System.out.println("POSSIBLE CAST ISSUE - JSOUPED: " + ID_str);
                // strip the markup down with Jsoup and retry, using Tika for the plain-text extraction
                text_to_clean = Jsoup.clean(text_to_clean, Whitelist.simpleText());
                TagNode node = cleaner.clean(text_to_clean);
                try {
                    cleaned_text = parseToPlainText(ID_str, htmlSerializer.getAsString(node));
                } catch (IOException | SAXException | TikaException e1) {
                    e1.printStackTrace();
                }
            } catch (Throwable e) {
                System.out.println("OTHER PARSING ERRORS: " + ID_str);
                // last-resort fallback (e.g. WTX057-B29-245): strip the markup down with Jsoup
                // and retry, using Tika for the plain-text extraction
                text_to_clean = Jsoup.clean(text_to_clean, Whitelist.simpleText());
                TagNode node = cleaner.clean(text_to_clean);
                try {
                    cleaned_text = parseToPlainText(ID_str, htmlSerializer.getAsString(node));
                } catch (IOException | SAXException | TikaException e1) {
                    e1.printStackTrace();
                }
            }

            if (cleaned_text.equals(text_to_clean)) {
                System.out.println("POSSIBLE PROBLEM IN CLEANING HTML: " + ID_str);
            }
            if (cleaned_text.contains("</") || cleaned_text.contains("/>")) {
                System.out.println("POSSIBLE HTML TAGS IN: " + ID_str);
            }

            File outputFile = new File(outputFileName);
            outputFile.getParentFile().mkdirs();

            KAFDocument document = new KAFDocument("en", "v3");

            // strip "]]>" sequences, which would break the CDATA section written by kaflib
            while (cleaned_text.contains("]]>")) {
                cleaned_text = cleaned_text.replace("]]>", "");
            }

            // document.setRawText(postProcess(cleaned_text));
            document.setRawText(cleaned_text);

            KAFDocument.FileDesc fileDesc = document.createFileDesc();
            fileDesc.title = ID_str;
            fileDesc.creationtime = sdf.format(thisDate);
            fileDesc.filename = URL_str;

            KAFDocument.Public aPublic = document.createPublic();
            aPublic.uri = urlTemplate + ID_str;
            aPublic.publicId = ID_str;

            document.save(outputFile.getAbsolutePath());

        } else {
            System.out.println("EMPTY FILE DISCARDED: " + ID_str);
        }
        System.out.println("PROCESSING OF DOCUMENT CONCLUDED: " + ID_str);
    }
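
    /**
     * Extracts plain text (document title followed by body text) from an HTML string using
     * Tika's {@link HtmlParser}, with no write limit on the content handler.
     */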
    private static String parseToPlainText(String ID_str, String TEXT_str)
            throws IOException, SAXException, TikaException {

        BodyContentHandler handler = new BodyContentHandler(-1);
        HtmlParser parser = new HtmlParser();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        // use an explicit charset instead of the platform default
        try (InputStream stream = new ByteArrayInputStream(TEXT_str.getBytes(StandardCharsets.UTF_8))) {
            parser.parse(stream, handler, metadata, pcontext);
            String title = metadata.get("title");
            if (title == null) {
                title = "";
            }
            String content = handler.toString();
            return title + "\n" + content;
        }
    }

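    /**
     * Saves the raw SQL of a single record as an {@code .sql} file under the {@code sql}
     * subfolder of the output folder, reconstructing the original
     * {@code INSERT INTO trecdocument} statement.
     */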
    private static void saveSingleSQL(String raw, String ID_str, File outputFolder) {

        String[] localName_parts = ID_str.split("-");
        String localName = localName_parts[0] + File.separator + localName_parts[1] + File.separator + ID_str;

        File sqlFile = new File(outputFolder.getAbsolutePath() + File.separator + "sql" + File.separator
                + localName + ".sql");
        sqlFile.getParentFile().mkdirs();

        // try-with-resources ensures the writer is closed even if write() fails
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(sqlFile))) {
            writer.write("INSERT INTO trecdocument VALUES (" + raw + ");");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

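    /**
     * Joins consecutive non-empty lines with spaces, turning empty lines into newlines
     * (paragraph breaks). Currently unused: it is referenced only from the commented-out
     * {@code document.setRawText(postProcess(...))} call above.
     */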
    private static String postProcess(String data) {

        String[] arr_data = data.split("\\n", -1);
        String final_data = "";

        for (int i = 0; i < arr_data.length; i++) {
            if (!arr_data[i].isEmpty()) {
                final_data = final_data + arr_data[i] + " ";
            } else {
                final_data = final_data + arr_data[i] + "\n";
            }
        }
        return final_data;
    }

}