1 package eu.fbk.dkm.pikes.resources.tackbp;
2
3 import java.io.IOException;
4 import java.io.Reader;
5 import java.io.StringReader;
6 import java.io.Writer;
7 import java.nio.file.Path;
8 import java.text.Normalizer;
9 import java.text.SimpleDateFormat;
10 import java.util.Collection;
11 import java.util.Date;
12 import java.util.List;
13 import java.util.Map;
14 import java.util.Objects;
15 import java.util.Properties;
16 import java.util.regex.Pattern;
17 import java.util.stream.Collectors;
18
19 import javax.annotation.Nullable;
20 import javax.xml.parsers.DocumentBuilder;
21 import javax.xml.parsers.DocumentBuilderFactory;
22
23 import com.google.common.base.Joiner;
24 import com.google.common.base.Strings;
25 import com.google.common.base.Throwables;
26 import com.google.common.collect.HashMultimap;
27 import com.google.common.collect.HashMultiset;
28 import com.google.common.collect.ImmutableList;
29 import com.google.common.collect.Iterables;
30 import com.google.common.collect.Lists;
31 import com.google.common.collect.Maps;
32 import com.google.common.collect.Multimap;
33 import com.google.common.collect.Multiset;
34 import com.google.common.io.CharStreams;
35
36 import org.apache.commons.lang.StringEscapeUtils;
37 import org.apache.commons.lang.StringUtils;
38 import org.slf4j.Logger;
39 import org.slf4j.LoggerFactory;
40 import org.w3c.dom.Element;
41 import org.w3c.dom.Node;
42 import org.w3c.dom.NodeList;
43 import org.w3c.dom.Text;
44 import org.xml.sax.InputSource;
45
46 import eu.fbk.rdfpro.util.IO;
47 import eu.fbk.utils.core.CommandLine;
48
49 import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
50 import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
51 import edu.stanford.nlp.ling.CoreLabel;
52 import edu.stanford.nlp.pipeline.Annotation;
53 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
54 import edu.stanford.nlp.util.CoreMap;
55 import ixa.kaflib.KAFDocument;
56
57
58
59
60
61
/**
 * Converter transforming the TAC KBP corpus (2011 format) into per-document NAF files, plus
 * optional gold NERC data (CoNLL format) and gold entity-linking data (AIDA format).
 */
public final class ConverterToNAF {

    private static final Logger LOGGER = LoggerFactory.getLogger(ConverterToNAF.class);

    // Format of <DATETIME> values inside the corpus XML.
    // NOTE(review): SimpleDateFormat is not thread-safe; OK here only while the converter
    // runs single-threaded - confirm before parallelizing.
    private static final SimpleDateFormat XML_DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");

    // Format used to emit the NAF fileDesc creation time
    private static final SimpleDateFormat NAF_DATE_FORMAT = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss.SSSZ");

    // Default URL template; '%s' is replaced by the document ID.
    // NOTE(review): never reassigned - should be declared final.
    private static String DEFAULT_URL = "http://pikes.fbk.eu/tackbp/%s";

    // Shared XML parser used to read the document XML embedded in the '.txt' file
    private static final DocumentBuilder DOCUMENT_BUILDER;

    // Stanford pipeline used only for tokenization and sentence splitting
    private static final StanfordCoreNLP TOKENIZE_PIPELINE;

    static {
        try {
            DOCUMENT_BUILDER = DocumentBuilderFactory.newInstance().newDocumentBuilder();

            // Configure a minimal tokenize/ssplit pipeline that preserves the original text
            // as much as possible (no americanization, no parenthesis/bracket normalization,
            // unknown characters kept, only quotes normalized to ASCII)
            final Properties props = new Properties();
            props.setProperty("annotators", "tokenize, ssplit");
            props.setProperty("tokenize.americanize", "false");
            props.setProperty("tokenize.normalizeParentheses", "false");
            props.setProperty("tokenize.normalizeOtherBrackets", "false");
            props.setProperty("tokenize.escapeForwardSlashAsterisk", "false");
            props.setProperty("tokenize.untokenizable", "noneKeep");
            props.setProperty("tokenize.asciiQuotes", "true");
            props.setProperty("tokenize.normalizeSpace", "false");
            TOKENIZE_PIPELINE = new StanfordCoreNLP(props);

        } catch (final Throwable ex) {
            // Static initialization failure is unrecoverable: surface it as an Error
            throw new Error(ex);
        }
    }
96
97 public static void main(final String... args) {
98 try {
99
100 final CommandLine cmd = CommandLine.parser().withName("tackbp-converter-to-naf")
101 .withHeader(
102 "Generates input and gold NAFs for the TAC KBP corpus (2011 format)")
103 .withOption("t", "txt",
104 "the TAC KBP '.txt' file containing article texts "
105 + "(e.g., tac2011test_docs.txt)",
106 "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
107 .withOption("k", "key",
108 "the TAC KBP '.key' file containing expected results "
109 + "(e.g., tac2011test_wiki2011.key)",
110 "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
111 .withOption("n", "naf", "the FOLDER where to write NAFs", "FOLDER",
112 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
113 .withOption("c", "conll",
114 "the FILE where to write the gold NERC data in the CONLL format",
115 "FILE", CommandLine.Type.FILE, true, false, false)
116 .withOption("a", "aida",
117 "the FILE where to write the gold EL data in the AIDA format", "FILE",
118 CommandLine.Type.FILE, true, false, false)
119 .withOption("u", "url-template", "URL template (with %s for the document ID)",
120 "URL", CommandLine.Type.STRING, true, false, false)
121 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
122
123
124 final Path txtPath = cmd.getOptionValue("t", Path.class);
125 final Path keyPath = cmd.getOptionValue("k", Path.class);
126 final Path nafPath = cmd.getOptionValue("n", Path.class);
127 final Path conllPath = cmd.getOptionValue("c", Path.class);
128 final Path aidaPath = cmd.getOptionValue("a", Path.class);
129 final String urlTemplate = cmd.getOptionValue("u", String.class, DEFAULT_URL);
130
131
132 final List<Document> documents = parse(txtPath, keyPath);
133
134
135 generate(nafPath, conllPath, aidaPath, documents, urlTemplate);
136
137 } catch (final Throwable ex) {
138
139 CommandLine.fail(ex);
140 }
141 }
142
    /**
     * Parses the TAC KBP '.txt' (document texts) and '.key' (expected results) files,
     * cross-checking them, and returns the resulting documents sorted by ID.
     *
     * @param txtPath path of the '.txt' file with document XML and query references
     * @param keyPath path of the '.key' file with gold query data
     * @return the parsed documents, sorted by document ID
     * @throws IOException on read failure
     */
    private static List<Document> parse(final Path txtPath, final Path keyPath)
            throws IOException {

        // Read the .key file: one tab-separated query entry per line.
        // Field layout as consumed here: 0=docId, 1=queryId, 3=queryText,
        // 4=expected NEL id, 5=expected NERC class - TODO confirm against corpus docs
        final Multimap<String, Query> queries = HashMultimap.create();
        final Multiset<String> nercClasses = HashMultiset.create();
        int nilCount = 0;
        try (Reader in = IO.utf8Reader(IO.buffer(IO.read(keyPath.toString())))) {
            for (final String line : CharStreams.readLines(in)) {

                // Split the entry and extract the fields used by the converter
                final String[] fields = line.split("\t");
                final String docId = fields[0];
                final String queryId = fields[1];
                final String queryText = fields[3];
                final String expectedNelId = fields[4];
                final String expectedNercId = fields[5];

                // Index the query by its document ID
                final Query query = new Query(docId, queryId, queryText, expectedNercId,
                        expectedNelId);
                queries.put(docId, query);

                // Track statistics: NERC class distribution and number of NIL entities
                nercClasses.add(expectedNercId);
                nilCount += query.expectedNelUri == null ? 1 : 0;
            }
        }
        // Snapshot the count now: entries are removed from 'queries' below as they are matched
        final int numQueries = queries.size();

        // Read the .txt file: one tab-separated line per (document, query) pair.
        // Field layout as consumed here: 0=docId, 1=queryId, 3=queryText, 4=document XML
        final Map<String, Document> documents = Maps.newHashMap();
        try (Reader in = IO.utf8Reader(IO.buffer(IO.read(txtPath.toString())))) {
            for (final String line : CharStreams.readLines(in)) {

                final String[] fields = line.split("\t");
                final String docId = fields[0];
                final String queryId = fields[1];
                final String queryText = fields[3];
                final String docText = fields[4];

                // Create the document on first sight, attaching all its .key queries
                Document document = documents.get(docId);
                if (document == null) {

                    final Collection<Query> docQueries = queries.get(docId);
                    if (docQueries.isEmpty()) {
                        LOGGER.warn("No matching entry(ies) in .key file for document " + docId);
                    }
                    document = new Document(docId, docText, queries.get(docId));
                    documents.put(docId, document);

                } else {

                    // Same document referenced by multiple queries: texts should coincide
                    if (!document.docXml.equals(docText)) {
                        LOGGER.warn("Different texts for document " + docId);
                    }
                }

                // Cross-check this .txt entry against the corresponding .key query
                final Query matchingQuery = document.queries.stream()
                        .filter(q -> q.queryId.equals(queryId)).findFirst().orElse(null);
                if (matchingQuery == null) {
                    LOGGER.warn("No entry in .key file for document " + docId + " and query "
                            + queryId);
                } else if (!matchingQuery.queryText.equals(queryText)) {
                    LOGGER.warn("Different query text for .txt and .key files for query " + queryId
                            + ": " + queryText + " - " + matchingQuery.queryText);
                }

                // Mark the query as matched by removing it from the multimap
                if (matchingQuery != null) {
                    queries.remove(docId, matchingQuery);
                }
            }
        }

        // Whatever is left in 'queries' was never referenced by the .txt file
        if (!queries.values().isEmpty()) {
            final StringBuilder builder = new StringBuilder(
                    "There are .key query entries not referenced in the .txt file:");
            for (final Query query : queries.values()) {
                builder.append(" ").append(query.docId).append(" ").append(query.queryId);
            }
            LOGGER.warn(builder.toString());
        }

        // Log summary statistics of the parsed corpus
        LOGGER.info("Parsed {} query entries for {} documents, {} NILs, {} NERC classes: {}",
                numQueries, documents.size(), nilCount, nercClasses.elementSet().size(),
                Joiner.on(", ").join(nercClasses.entrySet().stream()
                        .map(e -> e.getElement() + ":" + e.getCount()).toArray()));

        // Return the documents sorted by ID, for deterministic output
        final List<Document> sortedDocuments = Lists.newArrayList(documents.values());
        sortedDocuments.sort((d1, d2) -> d1.docId.compareTo(d2.docId));

        return sortedDocuments;
    }
244
245 private static void generate(final Path nafPath, final Path conllPath, final Path aidaPath,
246 final Iterable<Document> documents, final String urlTemplate) throws IOException {
247
248
249 Writer conllWriter = null;
250 Writer aidaWriter = null;
251
252 try {
253
254 conllWriter = IO.utf8Writer(IO.buffer(IO.write(conllPath.toString())));
255 aidaWriter = IO.utf8Writer(IO.buffer(IO.write(aidaPath.toString())));
256
257
258 int nafCount = 0;
259 for (final Document document : documents) {
260
261
262 final KAFDocument naf = new KAFDocument("en", "v3");
263
264
265 naf.setRawText(Joiner.on("\n").join(document.docTokens.stream()
266 .map(l -> Joiner.on(" ").join(l)).collect(Collectors.toList())));
267
268
269 final KAFDocument.FileDesc fileDesc = naf.createFileDesc();
270 fileDesc.title = document.docTitle;
271 fileDesc.creationtime = NAF_DATE_FORMAT.format(document.docDate);
272 fileDesc.filename = document.docId;
273 fileDesc.filetype = document.docSource + "/" + document.docType;
274
275
276 final KAFDocument.Public aPublic = naf.createPublic();
277 aPublic.uri = String.format(urlTemplate, document.docId);
278 aPublic.publicId = document.docId;
279
280
281 final Path outFile = nafPath.resolve(document.docId + ".naf");
282 naf.save(outFile.toFile());
283
284
285 conllWriter.write("-DOCSTART- " + document.docId + " O O\n");
286 aidaWriter.write("-DOCSTART- (" + document.docId + ")\n");
287
288
289 for (final List<String> sentence : document.docTokens) {
290
291
292 if (sentence.isEmpty()) {
293 continue;
294 }
295
296
297 final Query[] mentions = new Query[sentence.size()];
298 for (final Query query : document.queries) {
299 final List<String> queryTokens = ImmutableList
300 .copyOf(Iterables.concat(tokenize(query.queryText)));
301 outer: for (int i = 0; i < sentence.size(); ++i) {
302 for (int j = 0; j < queryTokens.size(); ++j) {
303 if (mentions[i + j] != null || !sentence.get(i + j)
304 .equalsIgnoreCase(queryTokens.get(j))) {
305 continue outer;
306 }
307 }
308 for (int j = 0; j < queryTokens.size(); ++j) {
309 mentions[i + j] = query;
310 }
311 }
312 }
313
314
315 for (int i = 0; i < sentence.size(); ++i) {
316
317
318 String nercTag = "O";
319 if (mentions[i] != null) {
320 final boolean b = i > 0 && mentions[i - 1] != null
321 && mentions[i - 1] != mentions[i] && mentions[i
322 - 1].expectedNercClass == mentions[i].expectedNercClass;
323 nercTag = (b ? "B-" : "I-")
324 + mentions[i].expectedNercClass.toUpperCase();
325 }
326
327
328 String elTag = null;
329 String elAnchor = null;
330 if (mentions[i] != null) {
331 final boolean b = i == 0 || mentions[i - 1] != mentions[i];
332 elTag = b ? "B" : "I";
333 elAnchor = sentence.get(i);
334 for (int j = i - 1; j >= 0 && mentions[j] == mentions[i]; --j) {
335 elAnchor = sentence.get(j) + " " + elAnchor;
336 }
337 for (int j = i + 1; j < sentence.size()
338 && mentions[j] == mentions[i]; ++j) {
339 elAnchor = elAnchor + " " + sentence.get(j);
340 }
341 }
342
343
344 final String token = sentence.get(i);
345 conllWriter.write(token + " - - " + nercTag + "\n");
346 if (nafCount < 58000) {
347 aidaWriter.write(token + (elTag == null ? ""
348 : "\t" + elTag + "\t" + elAnchor + "\t"
349 + (mentions[i].expectedNelUri == null ? "--NME--"
350 : mentions[i].expectedNelId + "\t"
351 + mentions[i].expectedNelUri
352 + "\t0\t/m/x"))
353 + "\n");
354 }
355 }
356
357
358 conllWriter.write("\n");
359 aidaWriter.write("\n");
360 }
361
362
363 ++nafCount;
364 }
365
366
367 LOGGER.info("{} NAF files emitted in {}", nafCount, nafPath);
368
369 } finally {
370
371 IO.closeQuietly(conllWriter);
372 IO.closeQuietly(aidaWriter);
373 }
374 }
375
376 private static List<List<String>> tokenize(final String string) {
377
378
379 final Annotation annotation = new Annotation(string);
380 TOKENIZE_PIPELINE.annotate(annotation);
381
382
383 final List<List<String>> tokens = Lists.newArrayList();
384 for (final CoreMap sentence : annotation.get(SentencesAnnotation.class)) {
385 final List<String> sentenceTokens = Lists.newArrayList();
386 tokens.add(sentenceTokens);
387 for (final CoreLabel token : sentence.get(TokensAnnotation.class)) {
388 final String text = ascii(token.originalText());
389
390
391
392 for (final String t : text.split("\\s+")) {
393 if (!Strings.isNullOrEmpty(t)) {
394 sentenceTokens.add(t);
395 }
396 }
397 }
398 }
399 return tokens;
400 }
401
402 private static String ascii(final String string) {
403 final StringBuilder builder = new StringBuilder(string.length());
404 for (int i = 0; i < string.length(); ++i) {
405 final char ch = string.charAt(i);
406 if (ch >= 32 && ch < 127) {
407 builder.append(ch);
408 } else if (ch == '©') {
409 builder.append("(c)");
410 } else if (ch == '™') {
411 builder.append("(tm)");
412 } else if (ch == '®') {
413 builder.append("(r)");
414 } else if (ch == '•' || ch == '·') {
415 builder.append("*");
416 } else if (ch == 'Ø') {
417 builder.append("0");
418 } else if (ch == '‑') {
419 builder.append("-");
420 } else if (ch == '´') {
421 builder.append("'");
422 } else if (ch == '¨') {
423 builder.append("\"");
424 } else if (ch == '¸' || ch == ',') {
425 builder.append(",");
426 } else {
427 final String s = Normalizer.normalize("" + ch, Normalizer.Form.NFD);
428 for (final char c : s.toCharArray()) {
429 if (c <= '\u007F') {
430 builder.append(c);
431 } else {
432
433 }
434 }
435 }
436 }
437 final String result = builder.toString();
438 if (!result.equals(string)) {
439 LOGGER.warn("Normalized {} to {}", string, result);
440 }
441 return StringEscapeUtils.unescapeXml(result);
442 }
443
    /**
     * A single TAC KBP document: its ID, raw XML, the metadata extracted from the XML, the
     * tokenized text and the queries targeting it.
     */
    private static final class Document {

        // Document ID (first field of the .txt file)
        final String docId;

        // Raw XML of the document, exactly as read from the .txt file
        final String docXml;

        // Value of the <DATETIME> element; defaults to 'now' if missing or unparseable
        final Date docDate;

        // SOURCE attribute of the <DOCTYPE> element (empty string if missing)
        final String docSource;

        // Text content of the <DOCTYPE> element (empty string if missing)
        final String docType;

        // Text of the <HEADLINE> element (empty string if missing)
        final String docTitle;

        // Tokenized text: one token list per sentence; empty lists separate paragraphs
        final List<List<String>> docTokens;

        // Queries associated to this document in the .key file
        final List<Query> queries;

        /**
         * Parses the supplied document XML, extracting metadata and tokenized text.
         *
         * @param docId the document ID
         * @param docXml the raw XML of the document
         * @param queries the queries associated to this document
         */
        public Document(final String docId, final String docXml, final Iterable<Query> queries) {

            // Store identity and raw data
            this.docId = Objects.requireNonNull(docId);
            this.docXml = Objects.requireNonNull(docXml);
            this.queries = ImmutableList.copyOf(queries);

            try {

                // Parse the XML (shared static DocumentBuilder - single-threaded use only)
                final org.w3c.dom.Document document = DOCUMENT_BUILDER
                        .parse(new InputSource(new StringReader(docXml)));

                // Cross-check the <DOCID> element (when unique) against the supplied ID
                final NodeList docIdNodes = document.getElementsByTagName("DOCID");
                if (docIdNodes.getLength() == 1) {
                    final String parsedDocId = docIdNodes.item(0).getTextContent().trim();
                    if (!docId.equals(parsedDocId)) {
                        LOGGER.warn("DOCID XML element " + parsedDocId
                                + " does not match ID of document " + docId);
                    }
                }

                // Extract the document date from <DATETIME>, defaulting to 'now' on failure
                Date date = new Date();
                final NodeList dateNodes = document.getElementsByTagName("DATETIME");
                if (dateNodes.getLength() == 1) {
                    final String dateStr = dateNodes.item(0).getTextContent().trim();
                    try {
                        date = XML_DATE_FORMAT.parse(dateStr);
                    } catch (final Throwable ex) {
                        LOGGER.warn("Could not parse <DATETIME> value " + dateStr);
                    }
                } else {
                    LOGGER.warn("No <DATETIME> XML element for document " + docId);
                }
                this.docDate = date;

                // Extract source and type from <DOCTYPE>, defaulting to empty strings
                final NodeList doctypeNodes = document.getElementsByTagName("DOCTYPE");
                if (doctypeNodes.getLength() == 1) {
                    final Element doctypeElement = (Element) doctypeNodes.item(0);
                    this.docSource = toNormalCase(doctypeElement.getAttribute("SOURCE").trim());
                    this.docType = toNormalCase(doctypeElement.getTextContent().trim());
                } else {
                    LOGGER.warn("No <DOCTYPE> XML element for document " + docId);
                    this.docSource = "";
                    this.docType = "";
                }

                // Extract the title from <HEADLINE>, defaulting to an empty string
                final NodeList headlineNodes = document.getElementsByTagName("HEADLINE");
                if (headlineNodes.getLength() == 1) {
                    this.docTitle = toNormalCase(headlineNodes.item(0).getTextContent().trim());
                } else {
                    this.docTitle = "";
                    LOGGER.warn("No <HEADLINE> XML element for document " + docId);
                }

                // Tokenize the title (turned into a sentence) followed by the <TEXT> content
                final List<List<String>> tokens = Lists.newArrayList();
                if (!this.docTitle.isEmpty()) {
                    tokens.addAll(tokenize(toSentence(this.docTitle)));
                }
                collectText(tokens, document.getElementsByTagName("TEXT"));
                this.docTokens = ImmutableList.copyOf(tokens);
                // At most one sentence (likely just the title) means no body text was found
                if (this.docTokens.size() <= 1) {
                    LOGGER.warn("No text extracted for document " + docId);
                }

            } catch (final Throwable ex) {
                // Propagate unchecked exceptions as-is; wrap checked ones
                Throwables.throwIfUnchecked(ex);
                throw new RuntimeException(ex);
            }
        }

        /**
         * Recursively collects tokenized text from the supplied DOM nodes, inserting an empty
         * token list before each text chunk as a paragraph separator.
         *
         * @param tokens the accumulator receiving the token lists
         * @param nodes the DOM nodes to visit
         */
        private static void collectText(final List<List<String>> tokens, final NodeList nodes) {
            for (int i = 0; i < nodes.getLength(); ++i) {
                final Node node = nodes.item(i);
                if (node instanceof Text) {
                    final String text = node.getTextContent().trim();
                    if (!text.isEmpty()) {
                        final List<List<String>> paragraph = tokenize(text);
                        // Empty list marks a paragraph boundary
                        tokens.add(ImmutableList.of());
                        tokens.addAll(paragraph);
                    }
                } else {
                    // Non-text node: recurse into its children
                    collectText(tokens, node.getChildNodes());
                }
            }
        }

        /**
         * Lowercases strings that are entirely uppercase (e.g. shouting headlines), leaving
         * mixed-case strings untouched.
         */
        private static String toNormalCase(final String string) {
            final String result = string.toUpperCase().equals(string) ? string.toLowerCase()
                    : string;
            return result;
        }

        /**
         * Turns the supplied string into a sentence: capitalized and terminated by a period.
         */
        private static String toSentence(String string) {
            string = StringUtils.capitalize(string).trim();
            return string.endsWith(".") ? string : string + ".";
        }

    }
565
566 private static final class Query {
567
568 private static final Pattern NIL_PATTERN = Pattern.compile("NIL[0-9]+");
569
570 final String docId;
571
572 final String queryId;
573
574 final String queryText;
575
576 final String expectedNercClass;
577
578 final String expectedNelId;
579
580 @Nullable
581 final String expectedNelUri;
582
583 public Query(final String docId, final String queryId, final String queryText,
584 final String expectedNercClass, final String expectedNelId) {
585
586 this.docId = Objects.requireNonNull(docId);
587 this.queryId = Objects.requireNonNull(queryId);
588 this.queryText = Objects.requireNonNull(queryText);
589 this.expectedNercClass = Objects.requireNonNull(expectedNercClass).replace("GPE",
590 "LOC");
591 this.expectedNelId = Objects.requireNonNull(expectedNelId);
592 this.expectedNelUri = NIL_PATTERN.matcher(expectedNelId).matches() ? null
593 : "http://dbpedia.org/resource/" + expectedNelId;
594 }
595
596 }
597
598 }