1   package eu.fbk.dkm.pikes.resources.trec;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.KAFDocument;
7   import org.joox.JOOX;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  import org.w3c.dom.Document;
11  import org.w3c.dom.Element;
12  import org.xml.sax.SAXException;
13  
14  import javax.xml.parsers.DocumentBuilder;
15  import javax.xml.parsers.DocumentBuilderFactory;
16  import javax.xml.parsers.ParserConfigurationException;
17  import java.io.ByteArrayInputStream;
18  import java.io.File;
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.text.SimpleDateFormat;
22  import java.util.Calendar;
23  import java.util.regex.Matcher;
24  import java.util.regex.Pattern;
25  
26  /**
27   * Created by alessio on 27/11/15.
28   *
29   * Warning: some files are too big for the JDK
30   * - FR941202.2
31   *
32   * Solution: pass -DentityExpansionLimit=0 to the Java command
33   */
34  
35  public class FR94 {
36  
37      private static final Logger LOGGER = LoggerFactory.getLogger(FR94.class);
38      private static String DEFAULT_URL = "http://document/%s";
39  
40      public static void main(String[] args) {
41  
42          try {
43  
44              final CommandLine cmd = CommandLine
45                      .parser()
46                      .withName("fr94-extractor")
47                      .withHeader("Extract FR94 documents from TREC dataset and save them in NAF format")
48                      .withOption("i", "input", "Input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
49                              false, true)
50                      .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
51                      .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
52                              CommandLine.Type.STRING, true, false, false)
53                      .withLogger(LoggerFactory.getLogger("eu.fbk")) //
54                      .parse(args);
55  
56              File inputDir = cmd.getOptionValue("input", File.class);
57  
58              String urlTemplate = DEFAULT_URL;
59              if (cmd.hasOption("url-template")) {
60                  urlTemplate = cmd.getOptionValue("url-template", String.class);
61              }
62  
63              File outputDir = cmd.getOptionValue("output", File.class);
64              if (!outputDir.exists()) {
65                  outputDir.mkdirs();
66              }
67  
68              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputDir)) {
69                  if (!file.isFile()) {
70                      continue;
71                  }
72                  if (file.getName().startsWith(".")) {
73                      continue;
74                  }
75  
76                  String outputTemplate = outputDir.getAbsolutePath() + File.separator + file.getName();
77                  File newFolder = new File(outputTemplate);
78                  newFolder.mkdirs();
79  
80                  outputTemplate += File.separator + "NAF";
81                  saveFile(file, outputTemplate, urlTemplate);
82              }
83          } catch (Exception e) {
84              CommandLine.fail(e);
85          }
86      }
87  
88      private static void saveFile(File inputFile, String outputFilePattern, String urlTemplate)
89              throws IOException, SAXException, ParserConfigurationException {
90  
91          LOGGER.info("Input file: {}", inputFile);
92  
93          StringBuffer stringBuffer = new StringBuffer();
94          stringBuffer.append("<?xml version=\"1.0\"?>\n"
95                  + "<!DOCTYPE tutorials [\n");
96          stringBuffer.append("<!ENTITY hyph \"-\">\n");
97          stringBuffer.append("<!ENTITY blank \" \">\n");
98          stringBuffer.append("<!ENTITY sect \" \">\n");
99          stringBuffer.append("<!ENTITY para \" \">\n");
100         stringBuffer.append("<!ENTITY cir \" \">\n");
101         stringBuffer.append("<!ENTITY rsquo \" \">\n");
102         stringBuffer.append("<!ENTITY mu \" \">\n");
103         stringBuffer.append("<!ENTITY times \" \">\n");
104         stringBuffer.append("<!ENTITY bull \" \">\n");
105         stringBuffer.append("<!ENTITY ge \">=\">\n");
106         stringBuffer.append("<!ENTITY reg \" \">\n");
107         stringBuffer.append("<!ENTITY cent \" \">\n");
108         stringBuffer.append("<!ENTITY amp \" \">\n");
109         stringBuffer.append("<!ENTITY gt \">\">\n");
110         stringBuffer.append("<!ENTITY lt \"<\">\n");
111         stringBuffer.append("<!ENTITY acirc \"a\">\n");
112         stringBuffer.append("<!ENTITY ncirc \"n\">\n");
113         stringBuffer.append("<!ENTITY atilde \"a\">\n");
114         stringBuffer.append("<!ENTITY ntilde \"n\">\n");
115         stringBuffer.append("<!ENTITY otilde \"o\">\n");
116         stringBuffer.append("<!ENTITY utilde \"u\">\n");
117         stringBuffer.append("<!ENTITY aacute \"a\">\n");
118         stringBuffer.append("<!ENTITY cacute \"c\">\n");
119         stringBuffer.append("<!ENTITY eacute \"e\">\n");
120         stringBuffer.append("<!ENTITY Eacute \"E\">\n");
121         stringBuffer.append("<!ENTITY Gacute \"G\">\n");
122         stringBuffer.append("<!ENTITY iacute \"i\">\n");
123         stringBuffer.append("<!ENTITY lacute \"l\">\n");
124         stringBuffer.append("<!ENTITY nacute \"n\">\n");
125         stringBuffer.append("<!ENTITY oacute \"o\">\n");
126         stringBuffer.append("<!ENTITY pacute \"p\">\n");
127         stringBuffer.append("<!ENTITY racute \"r\">\n");
128         stringBuffer.append("<!ENTITY sacute \"s\">\n");
129         stringBuffer.append("<!ENTITY uacute \"u\">\n");
130         stringBuffer.append("<!ENTITY ocirc \"o\">\n");
131         stringBuffer.append("<!ENTITY auml \"a\">\n");
132         stringBuffer.append("<!ENTITY euml \"e\">\n");
133         stringBuffer.append("<!ENTITY Euml \"E\">\n");
134         stringBuffer.append("<!ENTITY iuml \"i\">\n");
135         stringBuffer.append("<!ENTITY Iuml \"I\">\n");
136         stringBuffer.append("<!ENTITY Kuml \"K\">\n");
137         stringBuffer.append("<!ENTITY Ouml \"O\">\n");
138         stringBuffer.append("<!ENTITY ouml \"o\">\n");
139         stringBuffer.append("<!ENTITY uuml \"u\">\n");
140         stringBuffer.append("<!ENTITY Ccedil \"C\">\n");
141         stringBuffer.append("<!ENTITY ccedil \"c\">\n");
142         stringBuffer.append("<!ENTITY agrave \"a\">\n");
143         stringBuffer.append("<!ENTITY Agrave \"A\">\n");
144         stringBuffer.append("<!ENTITY egrave \"e\">\n");
145         stringBuffer.append("<!ENTITY Egrave \"E\">\n");
146         stringBuffer.append("<!ENTITY igrave \"i\">\n");
147         stringBuffer.append("<!ENTITY Ograve \"O\">\n");
148         stringBuffer.append("<!ENTITY ograve \"o\">\n");
149         stringBuffer.append("<!ENTITY ugrave \"u\">\n");
150         stringBuffer.append("]>\n");
151         stringBuffer.append("<ROOT>\n");
152         stringBuffer.append(Files.toString(inputFile, Charsets.UTF_8));
153         stringBuffer.append("\n</ROOT>\n");
154 
155         InputStream is = new ByteArrayInputStream(stringBuffer.toString().getBytes());
156         DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
157         DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
158         Document doc = dBuilder.parse(is);
159 
160         doc.getDocumentElement().normalize();
161 
162         int i = 0;
163         for (Element element : JOOX.$(doc).find("DOC")) {
164             Element docnoElement = JOOX.$(element).find("DOCNO").get(0);
165             Element textElement = JOOX.$(element).find("TEXT").get(0);
166 
167             // Incrementing also in case of errors
168             i++;
169             File outputFile = new File(outputFilePattern + "-" + i + ".naf");
170 
171             if (textElement == null) {
172                 LOGGER.error("TEXT is null");
173                 continue;
174             }
175 
176             String text = textElement.getTextContent().trim();
177 
178             String docno = "";
179             if (docnoElement != null) {
180                 docno = docnoElement.getTextContent().trim();
181             }
182 
183             if (docno.equals("")) {
184                 LOGGER.error("DOCNO is empty");
185             }
186 
187             String url = String.format(urlTemplate, docno);
188 
189             text = text.replaceAll("([^\\n])\\n([^\\n])", "$1 $2");
190             text = text.replaceAll("\\n+([a-z])", " $1");
191 
192             KAFDocument document = new KAFDocument("en", "v3");
193             document.setRawText(text);
194 
195             KAFDocument.FileDesc fileDesc = document.createFileDesc();
196             fileDesc.title = docno;
197             KAFDocument.Public aPublic = document.createPublic();
198             aPublic.uri = url;
199             aPublic.publicId = docno;
200 
201             document.save(outputFile.getAbsolutePath());
202         }
203     }
204 }