1 package eu.fbk.dkm.pikes.resources.trec;
2
3 import com.google.common.base.Charsets;
4 import com.google.common.io.Files;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.KAFDocument;
7 import org.joox.JOOX;
8 import org.slf4j.Logger;
9 import org.slf4j.LoggerFactory;
10 import org.w3c.dom.Document;
11 import org.w3c.dom.Element;
12 import org.xml.sax.SAXException;
13
14 import javax.xml.parsers.DocumentBuilder;
15 import javax.xml.parsers.DocumentBuilderFactory;
16 import javax.xml.parsers.ParserConfigurationException;
17 import java.io.ByteArrayInputStream;
18 import java.io.File;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.text.DateFormat;
22 import java.text.SimpleDateFormat;
23 import java.util.Date;
24 import java.util.Locale;
25 import java.util.regex.Matcher;
26 import java.util.regex.Pattern;
27
28
29
30
31
32
33
34 public class LATIMES {
35
36 private static final Logger LOGGER = LoggerFactory.getLogger(LATIMES.class);
37 private static String DEFAULT_URL = "http://document/%s";
38 private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
39 private static DateFormat format = new SimpleDateFormat("MMMM d, yyyy", Locale.ENGLISH);
40 private static Pattern datePattern = Pattern.compile("^([a-zA-Z]+\\s+[0-9]+,\\s+[0-9]+)");
41
42 public static void main(String[] args) {
43
44 try {
45
46 final CommandLine cmd = CommandLine
47 .parser()
48 .withName("latimes-extractor")
49 .withHeader("Extract LATIMES documents from TREC dataset and save them in NAF format")
50 .withOption("i", "input", "Input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
51 false, true)
52 .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
53 .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
54 CommandLine.Type.STRING, true, false, false)
55 .withLogger(LoggerFactory.getLogger("eu.fbk"))
56 .parse(args);
57
58 File inputDir = cmd.getOptionValue("input", File.class);
59
60 String urlTemplate = DEFAULT_URL;
61 if (cmd.hasOption("url-template")) {
62 urlTemplate = cmd.getOptionValue("url-template", String.class);
63 }
64
65 File outputDir = cmd.getOptionValue("output", File.class);
66 if (!outputDir.exists()) {
67 outputDir.mkdirs();
68 }
69
70 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputDir)) {
71 if (!file.isFile()) {
72 continue;
73 }
74 if (file.getName().startsWith(".")) {
75 continue;
76 }
77
78 String outputTemplate = outputDir.getAbsolutePath() + File.separator + file.getName();
79 File newFolder = new File(outputTemplate);
80 newFolder.mkdirs();
81
82 outputTemplate += File.separator + "NAF";
83 saveFile(file, outputTemplate, urlTemplate);
84 }
85 } catch (Exception e) {
86 CommandLine.fail(e);
87 }
88 }
89
90 private static void saveFile(File inputFile, String outputFilePattern, String urlTemplate)
91 throws IOException, SAXException, ParserConfigurationException {
92
93 LOGGER.info("Input file: {}", inputFile);
94
95 StringBuffer stringBuffer = new StringBuffer();
96 stringBuffer.append("<ROOT>\n");
97 stringBuffer.append(Files.toString(inputFile, Charsets.UTF_8));
98 stringBuffer.append("\n</ROOT>\n");
99
100 InputStream is = new ByteArrayInputStream(stringBuffer.toString().getBytes());
101 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
102 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
103 Document doc = dBuilder.parse(is);
104
105 doc.getDocumentElement().normalize();
106
107 int i = 0;
108 for (Element element : JOOX.$(doc).find("DOC")) {
109 Element docnoElement = JOOX.$(element).find("DOCNO").get(0);
110 Element dateElement = JOOX.$(element).find("DATE").get(0);
111 Element correctionDateElement = JOOX.$(element).find("CORRECTION-DATE").get(0);
112 Element headlineElement = JOOX.$(element).find("HEADLINE").get(0);
113
114 Element textElement = JOOX.$(element).find("TEXT").get(0);
115 Element graphicElement = JOOX.$(element).find("GRAPHIC").get(0);
116 Element correctionElement = JOOX.$(element).find("CORRECTION").get(0);
117
118
119 i++;
120 File outputFile = new File(outputFilePattern + "-" + i + ".naf");
121
122 String text = "";
123 if (textElement != null) {
124 text += textElement.getTextContent().trim() + "\n";
125 }
126 if (graphicElement != null) {
127 text += graphicElement.getTextContent().trim() + "\n";
128 }
129 if (correctionElement != null) {
130 text += correctionElement.getTextContent().trim() + "\n";
131 }
132
133 text = text.trim();
134
135 String headline = "";
136 if (headlineElement != null) {
137 headline = headlineElement.getTextContent().trim();
138 }
139
140 String docno = "";
141 if (docnoElement != null) {
142 docno = docnoElement.getTextContent().trim();
143 }
144
145 if (text.length() == 0 && headline.length() == 0) {
146 LOGGER.error("TEXT and HEADLINE are both empty ({})", docno);
147 continue;
148 }
149
150 String date = "";
151 if (dateElement != null) {
152 date = dateElement.getTextContent().trim();
153 }
154
155 if (docno.equals("")) {
156 LOGGER.error("DOCNO is empty");
157 }
158
159 String url = String.format(urlTemplate, docno);
160
161 headline = headline.replace('\n', ' ');
162 headline = headline.replaceAll("\\s+", " ");
163
164 Date thisDate = null;
165 Matcher matcher = datePattern.matcher(date);
166 if (matcher.find()) {
167 try {
168 thisDate = format.parse(matcher.group(1));
169 } catch (Exception e) {
170
171 }
172 }
173 if (thisDate == null && correctionDateElement != null) {
174 date = correctionDateElement.getTextContent().trim();
175 matcher = datePattern.matcher(date);
176 if (matcher.find()) {
177 try {
178 thisDate = format.parse(matcher.group(1));
179 } catch (Exception e) {
180
181 }
182 }
183 }
184
185 text = headline + "\n\n" + text;
186
187 KAFDocument document = new KAFDocument("en", "v3");
188 document.setRawText(text);
189
190 KAFDocument.FileDesc fileDesc = document.createFileDesc();
191 fileDesc.title = headline;
192 if (thisDate != null) {
193 fileDesc.creationtime = sdf.format(thisDate);
194 }
195 KAFDocument.Public aPublic = document.createPublic();
196 aPublic.uri = url;
197 aPublic.publicId = docno;
198
199 document.save(outputFile.getAbsolutePath());
200 }
201 }
202 }