1   package eu.fbk.dkm.pikes.resources.trec;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.KAFDocument;
7   import org.joox.JOOX;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  import org.w3c.dom.Document;
11  import org.w3c.dom.Element;
12  import org.xml.sax.SAXException;
13  
14  import javax.xml.parsers.DocumentBuilder;
15  import javax.xml.parsers.DocumentBuilderFactory;
16  import javax.xml.parsers.ParserConfigurationException;
17  import java.io.ByteArrayInputStream;
18  import java.io.File;
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.text.DateFormat;
22  import java.text.SimpleDateFormat;
23  import java.util.Calendar;
24  import java.util.Date;
25  import java.util.Locale;
26  
27  /**
28   * Created by alessio on 27/11/15.
29   *
30   * Warning: inconsistencies
31   * - FB496073 (lines 5212, 5366)
32   * - FB496111 (line 21480)
33   * - FB496246 (line 9252)
34   */
35  
36  public class FBIS {
37  
38      private static final Logger LOGGER = LoggerFactory.getLogger(FBIS.class);
39      private static String DEFAULT_URL = "http://document/%s";
40      private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
41      private static DateFormat format = new SimpleDateFormat("d MMMM yyyy", Locale.ENGLISH);
42  
43      public static void main(String[] args) {
44  
45          try {
46  
47              final CommandLine cmd = CommandLine
48                      .parser()
49                      .withName("fbis-extractor")
50                      .withHeader("Extract FBIS documents from TREC dataset and save them in NAF format")
51                      .withOption("i", "input", "Input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
52                              false, true)
53                      .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
54                      .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
55                              CommandLine.Type.STRING, true, false, false)
56                      .withLogger(LoggerFactory.getLogger("eu.fbk")) //
57                      .parse(args);
58  
59              File inputDir = cmd.getOptionValue("input", File.class);
60  
61              String urlTemplate = DEFAULT_URL;
62              if (cmd.hasOption("url-template")) {
63                  urlTemplate = cmd.getOptionValue("url-template", String.class);
64              }
65  
66              File outputDir = cmd.getOptionValue("output", File.class);
67              if (!outputDir.exists()) {
68                  outputDir.mkdirs();
69              }
70  
71              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputDir)) {
72                  if (!file.isFile()) {
73                      continue;
74                  }
75                  if (file.getName().startsWith(".")) {
76                      continue;
77                  }
78  
79                  String outputTemplate = outputDir.getAbsolutePath() + File.separator + file.getName();
80                  File newFolder = new File(outputTemplate);
81                  newFolder.mkdirs();
82  
83                  outputTemplate += File.separator + "NAF";
84                  saveFile(file, outputTemplate, urlTemplate);
85              }
86          } catch (Exception e) {
87              CommandLine.fail(e);
88          }
89      }
90  
91      private static void saveFile(File inputFile, String outputFilePattern, String urlTemplate)
92              throws IOException, SAXException, ParserConfigurationException {
93  
94          LOGGER.info("Input file: {}", inputFile);
95  
96          StringBuffer stringBuffer = new StringBuffer();
97          stringBuffer.append("<?xml version=\"1.0\"?>\n"
98                  + "<!DOCTYPE tutorials [\n");
99          stringBuffer.append("<!ENTITY amp \" \">\n");
100         stringBuffer.append("<!ENTITY gt \" \">\n");
101         stringBuffer.append("<!ENTITY lt \" \">\n");
102         stringBuffer.append("<!ENTITY AElig \"A\">\n");
103         stringBuffer.append("<!ENTITY ap \" \">\n");
104         stringBuffer.append("<!ENTITY deg \" \">\n");
105         stringBuffer.append("<!ENTITY egrave \"e\">\n");
106         stringBuffer.append("<!ENTITY eacute \"e\">\n");
107         stringBuffer.append("<!ENTITY oacute \"o\">\n");
108         stringBuffer.append("<!ENTITY ubreve \"u\">\n");
109         stringBuffer.append("<!ENTITY Ubreve \"U\">\n");
110         stringBuffer.append("<!ENTITY egs \" \">\n");
111         stringBuffer.append("<!ENTITY els \" \">\n");
112         stringBuffer.append("<!ENTITY percnt \" \">\n");
113         stringBuffer.append("<!ENTITY pound \"£\">\n");
114         stringBuffer.append("<!ENTITY yen \"¥\">\n");
115         stringBuffer.append("<!ENTITY agr \"\">\n");
116         stringBuffer.append("<!ENTITY bgr \"\">\n");
117         stringBuffer.append("<!ENTITY dgr \"\">\n");
118         stringBuffer.append("<!ENTITY egr \"\">\n");
119         stringBuffer.append("<!ENTITY ggr \"\">\n");
120         stringBuffer.append("<!ENTITY Ggr \"\">\n");
121         stringBuffer.append("<!ENTITY kgr \"\">\n");
122         stringBuffer.append("<!ENTITY lgr \"\">\n");
123         stringBuffer.append("<!ENTITY mgr \"\">\n");
124         stringBuffer.append("<!ENTITY pgr \"\">\n");
125         stringBuffer.append("<!ENTITY rgr \"\">\n");
126         stringBuffer.append("<!ENTITY sgr \"\">\n");
127         stringBuffer.append("<!ENTITY tgr \"\">\n");
128         stringBuffer.append("<!ENTITY xgr \"\">\n");
129         stringBuffer.append("<!ENTITY zgr \"\">\n");
130         stringBuffer.append("<!ENTITY eegr \"\">\n");
131         stringBuffer.append("<!ENTITY khgr \"\">\n");
132         stringBuffer.append("<!ENTITY phgr \"\">\n");
133         stringBuffer.append("<!ENTITY thgr \"\">\n");
134         stringBuffer.append("<!ENTITY ohm \"\">\n");
135         stringBuffer.append("<!ENTITY Bgr \"\">\n");
136         stringBuffer.append("<!ENTITY Ngr \"\">\n");
137         stringBuffer.append("<!ENTITY EEgr \"\">\n");
138         stringBuffer.append("<!ENTITY OHgr \"\">\n");
139         stringBuffer.append("<!ENTITY PSgr \"\">\n");
140         stringBuffer.append("<!ENTITY Omacr \"\">\n");
141         stringBuffer.append("]>\n");
142         stringBuffer.append("<ROOT>\n");
143         stringBuffer.append(Files.toString(inputFile, Charsets.UTF_8)
144                 .replaceAll("<F P=[0-9]+>", "<F>")
145                 .replaceAll("<FIG ID=[^>]+>", "<FIG>")
146                 .replaceAll("</?3>", "")
147         );
148         stringBuffer.append("\n</ROOT>\n");
149 
150         InputStream is = new ByteArrayInputStream(stringBuffer.toString().getBytes());
151         DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
152         DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
153         Document doc = dBuilder.parse(is);
154 
155         doc.getDocumentElement().normalize();
156 
157         int i = 0;
158         for (Element element : JOOX.$(doc).find("DOC")) {
159             Element docnoElement = JOOX.$(element).find("DOCNO").get(0);
160             Element dateElement = JOOX.$(element).find("DATE1").get(0);
161             Element headlineElement = JOOX.$(element).find("TI").get(0);
162             Element textElement = JOOX.$(element).find("TEXT").get(0);
163 
164             // Incrementing also in case of errors
165             i++;
166             File outputFile = new File(outputFilePattern + "-" + i + ".naf");
167 
168             if (textElement == null) {
169                 LOGGER.error("TEXT is null");
170                 continue;
171             }
172 
173             String text = JOOX.$(element).find("TEXT").content();
174             if (text.length() == 0) {
175                 LOGGER.error("TEXT is empty");
176                 continue;
177             }
178 
179             String docno = "";
180             if (docnoElement != null) {
181                 docno = docnoElement.getTextContent().trim();
182             }
183 
184             String date = "";
185             if (dateElement != null) {
186                 date = dateElement.getTextContent().trim();
187             }
188 
189             String headline = "";
190             if (headlineElement != null) {
191                 headline = headlineElement.getTextContent().trim();
192             }
193 
194             if (docno.equals("")) {
195                 LOGGER.error("DOCNO is empty");
196             }
197 
198             String url = String.format(urlTemplate, docno);
199 
200             headline = headline.replace('\n', ' ');
201             headline = headline.replaceAll("\\s+", " ");
202             text = text.replace('\n', ' ');
203             text = text.replaceAll("\\s+", " ");
204 
205             Date thisDate = null;
206             try {
207                 thisDate = format.parse(date);
208             } catch (Exception e) {
209                 // ignored
210             }
211 
212             text = headline + "\n\n" + text;
213 
214             KAFDocument document = new KAFDocument("en", "v3");
215             document.setRawText(text);
216 
217             KAFDocument.FileDesc fileDesc = document.createFileDesc();
218             fileDesc.title = headline;
219             if (thisDate != null) {
220                 fileDesc.creationtime = sdf.format(thisDate);
221             }
222             KAFDocument.Public aPublic = document.createPublic();
223             aPublic.uri = url;
224             aPublic.publicId = docno;
225 
226             document.save(outputFile.getAbsolutePath());
227         }
228     }
229 }