1   package eu.fbk.dkm.pikes.resources.trec;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.KAFDocument;
7   import org.apache.commons.io.FileUtils;
8   import org.joox.JOOX;
9   import org.slf4j.Logger;
10  import org.slf4j.LoggerFactory;
11  import org.w3c.dom.Document;
12  import org.w3c.dom.Element;
13  import org.xml.sax.SAXException;
14  
15  import javax.xml.parsers.DocumentBuilder;
16  import javax.xml.parsers.DocumentBuilderFactory;
17  import javax.xml.parsers.ParserConfigurationException;
18  import java.io.*;
19  import java.nio.file.Path;
20  import java.nio.file.Paths;
21  import java.text.DateFormat;
22  import java.text.SimpleDateFormat;
23  import java.util.Date;
24  import java.util.Iterator;
25  import java.util.Locale;
26  import java.util.regex.Matcher;
27  import java.util.regex.Pattern;
28  
29  /**
30   * Created by marcorospocher on 10/05/16.
31   */
32  public class Queries {
33  
34  
35      private static final Logger LOGGER = LoggerFactory.getLogger(Queries.class);
36      private static String DEFAULT_URL = "http://trec/query/";
37      private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
38      private static DateFormat format = new SimpleDateFormat("MMMM d, yyyy", Locale.ENGLISH);
39      private static Pattern datePattern = Pattern.compile("^([a-zA-Z]+\\s+[0-9]+,\\s+[0-9]+)");
40  
41      public static void main(String[] args) {
42  
43          try {
44  
45              final CommandLine cmd = CommandLine
46                      .parser()
47                      .withName("queries-extractor")
48                      .withHeader("Extract Queries documents from TREC dataset and save them in NAF format")
49                      .withOption("i", "input", "Input file", "FILE", CommandLine.Type.FILE, true,
50                              false, true)
51                      .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
52                      .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
53                              CommandLine.Type.STRING, true, false, false)
54                      .withLogger(LoggerFactory.getLogger("eu.fbk")) //
55                      .parse(args);
56  
57              File inputfile = cmd.getOptionValue("input", File.class);
58              File outputFolder = cmd.getOptionValue("output", File.class);
59  
60              String urlTemplate = DEFAULT_URL;
61              if (cmd.hasOption("url-template")) {
62                  urlTemplate = cmd.getOptionValue("url-template", String.class);
63              }
64  
65              DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
66              DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
67  
68              LOGGER.info(inputfile.getName());
69  
70              String content = FileUtils.readFileToString(inputfile, Charsets.UTF_8);
71  
72              StringBuffer newContent = new StringBuffer();
73              newContent.append("<root>\n");
74              newContent.append(content
75                      .replaceAll("<title>", "</num>\n<title>")
76                      .replaceAll("<desc>", "</title>\n<desc>")
77                      .replaceAll("<narr>", "</desc>\n<narr>")
78                      .replaceAll("</top>", "</narr>\n</top>")
79                      .replaceAll("R&D", "R&amp;D")
80              );
81              newContent.append("</root>\n");
82  
83              Document doc = dBuilder.parse(new ByteArrayInputStream(newContent.toString().getBytes(Charsets.UTF_8)));
84              for (Element element : JOOX.$(doc).find("top")) {
85                  Element numElement = JOOX.$(element).find("num").get(0);
86                  Element titleElement = JOOX.$(element).find("title").get(0);
87                  Element descElement = JOOX.$(element).find("desc").get(0);
88  
89                  String number = "q" + numElement.getTextContent().trim().substring(7).trim();
90                  //String title = titleElement.getTextContent().trim().replaceAll("\\s+", " ");
91                  String title = titleElement.getTextContent().trim().replaceAll("\\s+", " ");
92                  String desc = descElement.getTextContent().trim().substring(12).trim().replaceAll("\\s+", " ");
93  
94                  saveFile(outputFolder.getAbsolutePath() + "/keyword/" + number + ".naf", title, number, urlTemplate);
95                  saveFile(outputFolder.getAbsolutePath() + "/desc/" + number + ".naf", desc, number, urlTemplate);
96                  saveFile(outputFolder.getAbsolutePath() + "/keyword_desc/" + number + ".naf", title+"\n\n"+desc, number, urlTemplate);
97  
98              }
99  
100         } catch (Exception e) {
101             CommandLine.fail(e);
102         }
103     }
104 
105 
106     private static void saveFile(String outputFilename, String raw, String id, String url_template)
107             throws IOException, SAXException, ParserConfigurationException {
108 
109         File file = new File(outputFilename);
110         file.getParentFile().mkdirs();
111 
112         File outputFile = new File(outputFilename);
113 
114         KAFDocument document = new KAFDocument("en", "v3");
115         document.setRawText(raw);
116 
117         KAFDocument.FileDesc fileDesc = document.createFileDesc();
118         fileDesc.title = id;
119 
120         KAFDocument.Public aPublic = document.createPublic();
121         aPublic.uri = url_template+id;
122         aPublic.publicId = id;
123 
124         document.save(outputFile.getAbsolutePath());
125 
126     }
127 
128 
129 }