1 package eu.fbk.dkm.pikes.resources.trec;
2
3 import com.google.common.base.Charsets;
4 import com.google.common.io.Files;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.KAFDocument;
7 import org.joox.JOOX;
8 import org.slf4j.Logger;
9 import org.slf4j.LoggerFactory;
10 import org.w3c.dom.Document;
11 import org.w3c.dom.Element;
12 import org.xml.sax.SAXException;
13
14 import javax.xml.parsers.DocumentBuilder;
15 import javax.xml.parsers.DocumentBuilderFactory;
16 import javax.xml.parsers.ParserConfigurationException;
17 import java.io.ByteArrayInputStream;
18 import java.io.File;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.text.SimpleDateFormat;
22 import java.util.Calendar;
23 import java.util.regex.Matcher;
24 import java.util.regex.Pattern;
25
26
27
28
29
30 public class FT {
31
32 private static final Logger LOGGER = LoggerFactory.getLogger(FT.class);
33 private static String DEFAULT_URL = "http://document/%s";
34 private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
35
36 private static Pattern TITLE_PATTERN = Pattern.compile("FT [A-Za-z0-9- ]+ / (\\([^\\(\\)]*\\))?(.*)");
37
38 public static void main(String[] args) {
39
40 try {
41
42 final CommandLine cmd = CommandLine
43 .parser()
44 .withName("ft-extractor")
45 .withHeader("Extract FT documents from TREC dataset and save them in NAF format")
46 .withOption("i", "input", "Input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
47 false, true)
48 .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
49 .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
50 CommandLine.Type.STRING, true, false, false)
51 .withLogger(LoggerFactory.getLogger("eu.fbk"))
52 .parse(args);
53
54 File inputDir = cmd.getOptionValue("input", File.class);
55
56 String urlTemplate = DEFAULT_URL;
57 if (cmd.hasOption("url-template")) {
58 urlTemplate = cmd.getOptionValue("url-template", String.class);
59 }
60
61 File outputDir = cmd.getOptionValue("output", File.class);
62 if (!outputDir.exists()) {
63 outputDir.mkdirs();
64 }
65
66 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputDir)) {
67 if (!file.isFile()) {
68 continue;
69 }
70
71 String outputTemplate = outputDir.getAbsolutePath() + File.separator + file.getName();
72 File newFolder = new File(outputTemplate);
73 newFolder.mkdirs();
74
75 outputTemplate += File.separator + "NAF";
76 saveFile(file, outputTemplate, urlTemplate);
77 }
78 } catch (Exception e) {
79 CommandLine.fail(e);
80 }
81 }
82
83 private static void saveFile(File inputFile, String outputFilePattern, String urlTemplate)
84 throws IOException, SAXException, ParserConfigurationException {
85
86 LOGGER.info("Input file: {}", inputFile);
87
88 StringBuffer stringBuffer = new StringBuffer();
89 stringBuffer.append("<ROOT>\n");
90 stringBuffer.append(Files.toString(inputFile, Charsets.UTF_8));
91 stringBuffer.append("\n</ROOT>\n");
92
93 InputStream is = new ByteArrayInputStream(stringBuffer.toString().getBytes());
94 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
95 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
96 Document doc = dBuilder.parse(is);
97
98 doc.getDocumentElement().normalize();
99
100 int i = 0;
101 for (Element element : JOOX.$(doc).find("DOC")) {
102 Element docnoElement = JOOX.$(element).find("DOCNO").get(0);
103 Element dateElement = JOOX.$(element).find("DATE").get(0);
104 Element headlineElement = JOOX.$(element).find("HEADLINE").get(0);
105 Element textElement = JOOX.$(element).find("TEXT").get(0);
106
107
108 i++;
109 File outputFile = new File(outputFilePattern + "-" + i + ".naf");
110
111 if (textElement == null) {
112 LOGGER.error("TEXT is null");
113 continue;
114 }
115
116 String text = textElement.getTextContent().trim();
117
118 String docno = "";
119 if (docnoElement != null) {
120 docno = docnoElement.getTextContent().trim();
121 }
122
123 String date = "";
124 if (dateElement != null) {
125 date = dateElement.getTextContent().trim();
126 }
127
128 String headline = "";
129 if (headlineElement != null) {
130 headline = headlineElement.getTextContent().trim();
131 }
132
133 if (docno.equals("")) {
134 LOGGER.error("DOCNO is empty");
135 }
136
137 String url = String.format(urlTemplate, docno);
138
139 headline = headline.replace('\n', ' ');
140 headline = headline.replaceAll("\\s+", " ");
141 text = text.replace('\n', ' ');
142 text = text.replaceAll("\\s+", " ");
143
144 Matcher matcher = TITLE_PATTERN.matcher(headline);
145 if (matcher.find()) {
146 headline = matcher.group(2).trim();
147 }
148
149 Calendar.Builder builder = new Calendar.Builder();
150 try {
151 builder.setDate(1900 + Integer.parseInt(date.substring(0, 2)), Integer.parseInt(date.substring(2, 4)),
152 Integer.parseInt(date.substring(4)));
153 } catch (NumberFormatException e) {
154 LOGGER.error(e.getMessage());
155 }
156 Calendar calendar = builder.build();
157
158 text = headline + "\n\n" + text;
159
160 KAFDocument document = new KAFDocument("en", "v3");
161 document.setRawText(text);
162
163 KAFDocument.FileDesc fileDesc = document.createFileDesc();
164 fileDesc.title = headline;
165 fileDesc.creationtime = sdf.format(calendar.getTime());
166 KAFDocument.Public aPublic = document.createPublic();
167 aPublic.uri = url;
168 aPublic.publicId = docno;
169
170 document.save(outputFile.getAbsolutePath());
171 }
172 }
173 }