1   package eu.fbk.dkm.pikes.resources.trec;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.KAFDocument;
7   import org.joox.JOOX;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  import org.w3c.dom.Document;
11  import org.w3c.dom.Element;
12  import org.xml.sax.SAXException;
13  
14  import javax.xml.parsers.DocumentBuilder;
15  import javax.xml.parsers.DocumentBuilderFactory;
16  import javax.xml.parsers.ParserConfigurationException;
17  import java.io.ByteArrayInputStream;
18  import java.io.File;
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.text.DateFormat;
22  import java.text.SimpleDateFormat;
23  import java.util.Date;
24  import java.util.Locale;
25  import java.util.regex.Matcher;
26  import java.util.regex.Pattern;
27  
28  /**
29   * Created by alessio on 27/11/15.
30   *
31   * Warning: two empty documents in LA051090 (LA051090-0221 and LA051090-0222)
32   */
33  
34  public class LATIMES {
35  
36      private static final Logger LOGGER = LoggerFactory.getLogger(LATIMES.class);
37      private static String DEFAULT_URL = "http://document/%s";
38      private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
39      private static DateFormat format = new SimpleDateFormat("MMMM d, yyyy", Locale.ENGLISH);
40      private static Pattern datePattern = Pattern.compile("^([a-zA-Z]+\\s+[0-9]+,\\s+[0-9]+)");
41  
42      public static void main(String[] args) {
43  
44          try {
45  
46              final CommandLine cmd = CommandLine
47                      .parser()
48                      .withName("latimes-extractor")
49                      .withHeader("Extract LATIMES documents from TREC dataset and save them in NAF format")
50                      .withOption("i", "input", "Input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
51                              false, true)
52                      .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
53                      .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
54                              CommandLine.Type.STRING, true, false, false)
55                      .withLogger(LoggerFactory.getLogger("eu.fbk")) //
56                      .parse(args);
57  
58              File inputDir = cmd.getOptionValue("input", File.class);
59  
60              String urlTemplate = DEFAULT_URL;
61              if (cmd.hasOption("url-template")) {
62                  urlTemplate = cmd.getOptionValue("url-template", String.class);
63              }
64  
65              File outputDir = cmd.getOptionValue("output", File.class);
66              if (!outputDir.exists()) {
67                  outputDir.mkdirs();
68              }
69  
70              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputDir)) {
71                  if (!file.isFile()) {
72                      continue;
73                  }
74                  if (file.getName().startsWith(".")) {
75                      continue;
76                  }
77  
78                  String outputTemplate = outputDir.getAbsolutePath() + File.separator + file.getName();
79                  File newFolder = new File(outputTemplate);
80                  newFolder.mkdirs();
81  
82                  outputTemplate += File.separator + "NAF";
83                  saveFile(file, outputTemplate, urlTemplate);
84              }
85          } catch (Exception e) {
86              CommandLine.fail(e);
87          }
88      }
89  
90      private static void saveFile(File inputFile, String outputFilePattern, String urlTemplate)
91              throws IOException, SAXException, ParserConfigurationException {
92  
93          LOGGER.info("Input file: {}", inputFile);
94  
95          StringBuffer stringBuffer = new StringBuffer();
96          stringBuffer.append("<ROOT>\n");
97          stringBuffer.append(Files.toString(inputFile, Charsets.UTF_8));
98          stringBuffer.append("\n</ROOT>\n");
99  
100         InputStream is = new ByteArrayInputStream(stringBuffer.toString().getBytes());
101         DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
102         DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
103         Document doc = dBuilder.parse(is);
104 
105         doc.getDocumentElement().normalize();
106 
107         int i = 0;
108         for (Element element : JOOX.$(doc).find("DOC")) {
109             Element docnoElement = JOOX.$(element).find("DOCNO").get(0);
110             Element dateElement = JOOX.$(element).find("DATE").get(0);
111             Element correctionDateElement = JOOX.$(element).find("CORRECTION-DATE").get(0);
112             Element headlineElement = JOOX.$(element).find("HEADLINE").get(0);
113 
114             Element textElement = JOOX.$(element).find("TEXT").get(0);
115             Element graphicElement = JOOX.$(element).find("GRAPHIC").get(0);
116             Element correctionElement = JOOX.$(element).find("CORRECTION").get(0);
117 
118             // Incrementing also in case of errors
119             i++;
120             File outputFile = new File(outputFilePattern + "-" + i + ".naf");
121 
122             String text = "";
123             if (textElement != null) {
124                 text += textElement.getTextContent().trim() + "\n";
125             }
126             if (graphicElement != null) {
127                 text += graphicElement.getTextContent().trim() + "\n";
128             }
129             if (correctionElement != null) {
130                 text += correctionElement.getTextContent().trim() + "\n";
131             }
132 
133             text = text.trim();
134 
135             String headline = "";
136             if (headlineElement != null) {
137                 headline = headlineElement.getTextContent().trim();
138             }
139 
140             String docno = "";
141             if (docnoElement != null) {
142                 docno = docnoElement.getTextContent().trim();
143             }
144 
145             if (text.length() == 0 && headline.length() == 0) {
146                 LOGGER.error("TEXT and HEADLINE are both empty ({})", docno);
147                 continue;
148             }
149 
150             String date = "";
151             if (dateElement != null) {
152                 date = dateElement.getTextContent().trim();
153             }
154 
155             if (docno.equals("")) {
156                 LOGGER.error("DOCNO is empty");
157             }
158 
159             String url = String.format(urlTemplate, docno);
160 
161             headline = headline.replace('\n', ' ');
162             headline = headline.replaceAll("\\s+", " ");
163 
164             Date thisDate = null;
165             Matcher matcher = datePattern.matcher(date);
166             if (matcher.find()) {
167                 try {
168                     thisDate = format.parse(matcher.group(1));
169                 } catch (Exception e) {
170                     // ignored
171                 }
172             }
173             if (thisDate == null && correctionDateElement != null) {
174                 date = correctionDateElement.getTextContent().trim();
175                 matcher = datePattern.matcher(date);
176                 if (matcher.find()) {
177                     try {
178                         thisDate = format.parse(matcher.group(1));
179                     } catch (Exception e) {
180                         // ignored
181                     }
182                 }
183             }
184 
185             text = headline + "\n\n" + text;
186 
187             KAFDocument document = new KAFDocument("en", "v3");
188             document.setRawText(text);
189 
190             KAFDocument.FileDesc fileDesc = document.createFileDesc();
191             fileDesc.title = headline;
192             if (thisDate != null) {
193                 fileDesc.creationtime = sdf.format(thisDate);
194             }
195             KAFDocument.Public aPublic = document.createPublic();
196             aPublic.uri = url;
197             aPublic.publicId = docno;
198 
199             document.save(outputFile.getAbsolutePath());
200         }
201     }
202 }