1 package eu.fbk.dkm.pikes.resources.trec;
2
import com.google.common.base.Charsets;
import eu.fbk.utils.core.CommandLine;
import org.apache.commons.io.FileUtils;
import org.joox.JOOX;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.OutputStreamWriter;
import java.util.Iterator;
import java.util.regex.Pattern;
19
20
21
22
23
24 public class QueriesTSV {
25
26 private static final Logger LOGGER = LoggerFactory.getLogger(QueriesTSV.class);
27
28
29
30 public static void main(String[] args) {
31
32 try {
33
34 final CommandLine cmd = CommandLine
35 .parser()
36 .withName("trec-queriesTSV-converter")
37 .withHeader("Convert TREC queries into TSV format")
38 .withOption("i", "input", "Input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
39 false, true)
40 .withOption("o", "output", "Output file", "FILE", CommandLine.Type.FILE, true, false, true)
41 .withLogger(LoggerFactory.getLogger("eu.fbk"))
42 .parse(args);
43
44 File inputFolder = cmd.getOptionValue("input", File.class);
45 File outputFile = cmd.getOptionValue("output", File.class);
46
47 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
48 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
49
50 BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
51
52 Iterator<File> fileIterator = FileUtils.iterateFiles(inputFolder, null, true);
53 while (fileIterator.hasNext()) {
54 File file = fileIterator.next();
55
56 LOGGER.info(file.getName());
57
58 String content = FileUtils.readFileToString(file, Charsets.UTF_8);
59
60 StringBuffer newContent = new StringBuffer();
61 newContent.append("<root>\n");
62 newContent.append(content
63 .replaceAll("<title>", "</num>\n<title>")
64 .replaceAll("<desc>", "</title>\n<desc>")
65 .replaceAll("<narr>", "</desc>\n<narr>")
66 .replaceAll("</top>", "</narr>\n</top>")
67 .replaceAll("R&D", "R&D")
68 );
69 newContent.append("</root>\n");
70
71 Document doc = dBuilder.parse(new ByteArrayInputStream(newContent.toString().getBytes(Charsets.UTF_8)));
72 for (Element element : JOOX.$(doc).find("top")) {
73 Element numElement = JOOX.$(element).find("num").get(0);
74 Element titleElement = JOOX.$(element).find("title").get(0);
75 Element descElement = JOOX.$(element).find("desc").get(0);
76
77 String number = "q" + numElement.getTextContent().trim().substring(7).trim();
78 String title = titleElement.getTextContent().trim().replaceAll("\\s+", " ");
79 String desc = descElement.getTextContent().trim().substring(12).trim().replaceAll("\\s+", " ");
80
81 writer.append(number).append("\t");
82 writer.append(title).append("\t");
83 writer.append(desc).append("\n");
84 }
85 }
86
87 writer.close();
88
89 } catch (Exception e) {
90 CommandLine.fail(e);
91 }
92 }
93 }