1 package eu.fbk.dkm.pikes.resources.trec;
2
3 import com.google.common.base.Charsets;
4 import com.google.common.io.Files;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.KAFDocument;
7 import org.apache.commons.io.FileUtils;
8 import org.joox.JOOX;
9 import org.slf4j.Logger;
10 import org.slf4j.LoggerFactory;
11 import org.w3c.dom.Document;
12 import org.w3c.dom.Element;
13 import org.xml.sax.SAXException;
14
15 import javax.xml.parsers.DocumentBuilder;
16 import javax.xml.parsers.DocumentBuilderFactory;
17 import javax.xml.parsers.ParserConfigurationException;
18 import java.io.*;
19 import java.nio.file.Path;
20 import java.nio.file.Paths;
21 import java.text.DateFormat;
22 import java.text.SimpleDateFormat;
23 import java.util.Date;
24 import java.util.Iterator;
25 import java.util.Locale;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
29
30
31
32 public class Queries {
33
34
35 private static final Logger LOGGER = LoggerFactory.getLogger(Queries.class);
36 private static String DEFAULT_URL = "http://trec/query/";
37 private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
38 private static DateFormat format = new SimpleDateFormat("MMMM d, yyyy", Locale.ENGLISH);
39 private static Pattern datePattern = Pattern.compile("^([a-zA-Z]+\\s+[0-9]+,\\s+[0-9]+)");
40
41 public static void main(String[] args) {
42
43 try {
44
45 final CommandLine cmd = CommandLine
46 .parser()
47 .withName("queries-extractor")
48 .withHeader("Extract Queries documents from TREC dataset and save them in NAF format")
49 .withOption("i", "input", "Input file", "FILE", CommandLine.Type.FILE, true,
50 false, true)
51 .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
52 .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
53 CommandLine.Type.STRING, true, false, false)
54 .withLogger(LoggerFactory.getLogger("eu.fbk"))
55 .parse(args);
56
57 File inputfile = cmd.getOptionValue("input", File.class);
58 File outputFolder = cmd.getOptionValue("output", File.class);
59
60 String urlTemplate = DEFAULT_URL;
61 if (cmd.hasOption("url-template")) {
62 urlTemplate = cmd.getOptionValue("url-template", String.class);
63 }
64
65 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
66 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
67
68 LOGGER.info(inputfile.getName());
69
70 String content = FileUtils.readFileToString(inputfile, Charsets.UTF_8);
71
72 StringBuffer newContent = new StringBuffer();
73 newContent.append("<root>\n");
74 newContent.append(content
75 .replaceAll("<title>", "</num>\n<title>")
76 .replaceAll("<desc>", "</title>\n<desc>")
77 .replaceAll("<narr>", "</desc>\n<narr>")
78 .replaceAll("</top>", "</narr>\n</top>")
79 .replaceAll("R&D", "R&D")
80 );
81 newContent.append("</root>\n");
82
83 Document doc = dBuilder.parse(new ByteArrayInputStream(newContent.toString().getBytes(Charsets.UTF_8)));
84 for (Element element : JOOX.$(doc).find("top")) {
85 Element numElement = JOOX.$(element).find("num").get(0);
86 Element titleElement = JOOX.$(element).find("title").get(0);
87 Element descElement = JOOX.$(element).find("desc").get(0);
88
89 String number = "q" + numElement.getTextContent().trim().substring(7).trim();
90
91 String title = titleElement.getTextContent().trim().replaceAll("\\s+", " ");
92 String desc = descElement.getTextContent().trim().substring(12).trim().replaceAll("\\s+", " ");
93
94 saveFile(outputFolder.getAbsolutePath() + "/keyword/" + number + ".naf", title, number, urlTemplate);
95 saveFile(outputFolder.getAbsolutePath() + "/desc/" + number + ".naf", desc, number, urlTemplate);
96 saveFile(outputFolder.getAbsolutePath() + "/keyword_desc/" + number + ".naf", title+"\n\n"+desc, number, urlTemplate);
97
98 }
99
100 } catch (Exception e) {
101 CommandLine.fail(e);
102 }
103 }
104
105
106 private static void saveFile(String outputFilename, String raw, String id, String url_template)
107 throws IOException, SAXException, ParserConfigurationException {
108
109 File file = new File(outputFilename);
110 file.getParentFile().mkdirs();
111
112 File outputFile = new File(outputFilename);
113
114 KAFDocument document = new KAFDocument("en", "v3");
115 document.setRawText(raw);
116
117 KAFDocument.FileDesc fileDesc = document.createFileDesc();
118 fileDesc.title = id;
119
120 KAFDocument.Public aPublic = document.createPublic();
121 aPublic.uri = url_template+id;
122 aPublic.publicId = id;
123
124 document.save(outputFile.getAbsolutePath());
125
126 }
127
128
129 }