1   package eu.fbk.dkm.pikes.resources.trec;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.KAFDocument;
7   import org.joox.JOOX;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  import org.w3c.dom.Document;
11  import org.w3c.dom.Element;
12  import org.xml.sax.SAXException;
13  
14  import javax.xml.parsers.DocumentBuilder;
15  import javax.xml.parsers.DocumentBuilderFactory;
16  import javax.xml.parsers.ParserConfigurationException;
17  import java.io.ByteArrayInputStream;
18  import java.io.File;
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.text.SimpleDateFormat;
22  import java.util.Calendar;
23  import java.util.regex.Matcher;
24  import java.util.regex.Pattern;
25  
26  /**
27   * Created by alessio on 27/11/15.
28   */
29  
30  public class FT {
31  
32      private static final Logger LOGGER = LoggerFactory.getLogger(FT.class);
33      private static String DEFAULT_URL = "http://document/%s";
34      private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
35  
36      private static Pattern TITLE_PATTERN = Pattern.compile("FT [A-Za-z0-9- ]+ / (\\([^\\(\\)]*\\))?(.*)");
37  
38      public static void main(String[] args) {
39  
40          try {
41  
42              final CommandLine cmd = CommandLine
43                      .parser()
44                      .withName("ft-extractor")
45                      .withHeader("Extract FT documents from TREC dataset and save them in NAF format")
46                      .withOption("i", "input", "Input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
47                              false, true)
48                      .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
49                      .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
50                              CommandLine.Type.STRING, true, false, false)
51                      .withLogger(LoggerFactory.getLogger("eu.fbk")) //
52                      .parse(args);
53  
54              File inputDir = cmd.getOptionValue("input", File.class);
55  
56              String urlTemplate = DEFAULT_URL;
57              if (cmd.hasOption("url-template")) {
58                  urlTemplate = cmd.getOptionValue("url-template", String.class);
59              }
60  
61              File outputDir = cmd.getOptionValue("output", File.class);
62              if (!outputDir.exists()) {
63                  outputDir.mkdirs();
64              }
65  
66              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputDir)) {
67                  if (!file.isFile()) {
68                      continue;
69                  }
70  
71                  String outputTemplate = outputDir.getAbsolutePath() + File.separator + file.getName();
72                  File newFolder = new File(outputTemplate);
73                  newFolder.mkdirs();
74  
75                  outputTemplate += File.separator + "NAF";
76                  saveFile(file, outputTemplate, urlTemplate);
77              }
78          } catch (Exception e) {
79              CommandLine.fail(e);
80          }
81      }
82  
83      private static void saveFile(File inputFile, String outputFilePattern, String urlTemplate)
84              throws IOException, SAXException, ParserConfigurationException {
85  
86          LOGGER.info("Input file: {}", inputFile);
87  
88          StringBuffer stringBuffer = new StringBuffer();
89          stringBuffer.append("<ROOT>\n");
90          stringBuffer.append(Files.toString(inputFile, Charsets.UTF_8));
91          stringBuffer.append("\n</ROOT>\n");
92  
93          InputStream is = new ByteArrayInputStream(stringBuffer.toString().getBytes());
94          DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
95          DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
96          Document doc = dBuilder.parse(is);
97  
98          doc.getDocumentElement().normalize();
99  
100         int i = 0;
101         for (Element element : JOOX.$(doc).find("DOC")) {
102             Element docnoElement = JOOX.$(element).find("DOCNO").get(0);
103             Element dateElement = JOOX.$(element).find("DATE").get(0);
104             Element headlineElement = JOOX.$(element).find("HEADLINE").get(0);
105             Element textElement = JOOX.$(element).find("TEXT").get(0);
106 
107             // Incrementing also in case of errors
108             i++;
109             File outputFile = new File(outputFilePattern + "-" + i + ".naf");
110 
111             if (textElement == null) {
112                 LOGGER.error("TEXT is null");
113                 continue;
114             }
115 
116             String text = textElement.getTextContent().trim();
117 
118             String docno = "";
119             if (docnoElement != null) {
120                 docno = docnoElement.getTextContent().trim();
121             }
122 
123             String date = "";
124             if (dateElement != null) {
125                 date = dateElement.getTextContent().trim();
126             }
127 
128             String headline = "";
129             if (headlineElement != null) {
130                 headline = headlineElement.getTextContent().trim();
131             }
132 
133             if (docno.equals("")) {
134                 LOGGER.error("DOCNO is empty");
135             }
136 
137             String url = String.format(urlTemplate, docno);
138 
139             headline = headline.replace('\n', ' ');
140             headline = headline.replaceAll("\\s+", " ");
141             text = text.replace('\n', ' ');
142             text = text.replaceAll("\\s+", " ");
143 
144             Matcher matcher = TITLE_PATTERN.matcher(headline);
145             if (matcher.find()) {
146                 headline = matcher.group(2).trim();
147             }
148 
149             Calendar.Builder builder = new Calendar.Builder();
150             try {
151                 builder.setDate(1900 + Integer.parseInt(date.substring(0, 2)), Integer.parseInt(date.substring(2, 4)),
152                         Integer.parseInt(date.substring(4)));
153             } catch (NumberFormatException e) {
154                 LOGGER.error(e.getMessage());
155             }
156             Calendar calendar = builder.build();
157 
158             text = headline + "\n\n" + text;
159 
160             KAFDocument document = new KAFDocument("en", "v3");
161             document.setRawText(text);
162 
163             KAFDocument.FileDesc fileDesc = document.createFileDesc();
164             fileDesc.title = headline;
165             fileDesc.creationtime = sdf.format(calendar.getTime());
166             KAFDocument.Public aPublic = document.createPublic();
167             aPublic.uri = url;
168             aPublic.publicId = docno;
169 
170             document.save(outputFile.getAbsolutePath());
171         }
172     }
173 }