1   package eu.fbk.dkm.pikes.resources.trec;
2   
3   import com.google.common.base.Charsets;
4   import eu.fbk.utils.core.CommandLine;
5   import org.apache.commons.io.FileUtils;
6   import org.joox.JOOX;
7   import org.slf4j.Logger;
8   import org.slf4j.LoggerFactory;
9   import org.w3c.dom.Document;
10  import org.w3c.dom.Element;
11  
12  import javax.xml.parsers.DocumentBuilder;
13  import javax.xml.parsers.DocumentBuilderFactory;
14  import java.io.BufferedWriter;
15  import java.io.ByteArrayInputStream;
16  import java.io.File;
17  import java.io.FileWriter;
18  import java.util.Iterator;
19  
20  /**
21   * Created by alessio on 15/12/15.
22   */
23  
24  public class QueriesTSV {
25  
26      private static final Logger LOGGER = LoggerFactory.getLogger(QueriesTSV.class);
27  //    private static String folder = "/Users/alessio/Documents/scripts/pikesir/test/trec/queries/";
28  //    private static String outputFile = "/Users/alessio/Documents/scripts/pikesir/test/trec/queries.tsv";
29  
30      public static void main(String[] args) {
31  
32          try {
33  
34              final CommandLine cmd = CommandLine
35                      .parser()
36                      .withName("trec-queriesTSV-converter")
37                      .withHeader("Convert TREC queries into TSV format")
38                      .withOption("i", "input", "Input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
39                              false, true)
40                      .withOption("o", "output", "Output file", "FILE", CommandLine.Type.FILE, true, false, true)
41                      .withLogger(LoggerFactory.getLogger("eu.fbk")) //
42                      .parse(args);
43  
44              File inputFolder = cmd.getOptionValue("input", File.class);
45              File outputFile = cmd.getOptionValue("output", File.class);
46  
47              DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
48              DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
49  
50              BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
51  
52              Iterator<File> fileIterator = FileUtils.iterateFiles(inputFolder, null, true);
53              while (fileIterator.hasNext()) {
54                  File file = fileIterator.next();
55  
56                  LOGGER.info(file.getName());
57  
58                  String content = FileUtils.readFileToString(file, Charsets.UTF_8);
59  
60                  StringBuffer newContent = new StringBuffer();
61                  newContent.append("<root>\n");
62                  newContent.append(content
63                                  .replaceAll("<title>", "</num>\n<title>")
64                                  .replaceAll("<desc>", "</title>\n<desc>")
65                                  .replaceAll("<narr>", "</desc>\n<narr>")
66                                  .replaceAll("</top>", "</narr>\n</top>")
67                                  .replaceAll("R&D", "R&amp;D")
68                  );
69                  newContent.append("</root>\n");
70  
71                  Document doc = dBuilder.parse(new ByteArrayInputStream(newContent.toString().getBytes(Charsets.UTF_8)));
72                  for (Element element : JOOX.$(doc).find("top")) {
73                      Element numElement = JOOX.$(element).find("num").get(0);
74                      Element titleElement = JOOX.$(element).find("title").get(0);
75                      Element descElement = JOOX.$(element).find("desc").get(0);
76  
77                      String number = "q" + numElement.getTextContent().trim().substring(7).trim();
78                      String title = titleElement.getTextContent().trim().replaceAll("\\s+", " ");
79                      String desc = descElement.getTextContent().trim().substring(12).trim().replaceAll("\\s+", " ");
80  
81                      writer.append(number).append("\t");
82                      writer.append(title).append("\t");
83                      writer.append(desc).append("\n");
84                  }
85              }
86  
87              writer.close();
88  
89          } catch (Exception e) {
90              CommandLine.fail(e);
91          }
92      }
93  }