package eu.fbk.dkm.pikes.resources.tackbp;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.nio.file.Path;
import java.text.Normalizer;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.annotation.Nullable;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import com.google.common.io.CharStreams;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;

import eu.fbk.rdfpro.util.IO;
import eu.fbk.utils.core.CommandLine;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import ixa.kaflib.KAFDocument;

/**
 * Converts the TAC KBP corpus (2011 format) to NAF.
 *
 * @author Francesco Corcoglioniti <corcoglio@fbk.eu> (created 2017-09-22)
 */
public final class ConverterToNAF {

    private static final Logger LOGGER = LoggerFactory.getLogger(ConverterToNAF.class);

    private static final SimpleDateFormat XML_DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");

    private static final SimpleDateFormat NAF_DATE_FORMAT = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss.SSSZ");

    private static final String DEFAULT_URL = "http://pikes.fbk.eu/tackbp/%s";

    private static final DocumentBuilder DOCUMENT_BUILDER;

    private static final StanfordCoreNLP TOKENIZE_PIPELINE;

    static {
        try {
            DOCUMENT_BUILDER = DocumentBuilderFactory.newInstance().newDocumentBuilder();

            final Properties props = new Properties();
            props.setProperty("annotators", "tokenize, ssplit");
            props.setProperty("tokenize.americanize", "false");
            props.setProperty("tokenize.normalizeParentheses", "false");
            props.setProperty("tokenize.normalizeOtherBrackets", "false");
            props.setProperty("tokenize.escapeForwardSlashAsterisk", "false");
            props.setProperty("tokenize.untokenizable", "noneKeep");
            props.setProperty("tokenize.asciiQuotes", "true");
            props.setProperty("tokenize.normalizeSpace", "false");
            TOKENIZE_PIPELINE = new StanfordCoreNLP(props);

        } catch (final Throwable ex) {
            throw new Error(ex);
        }
    }

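    /**
     * Program entry point. Parses the command line, reads the TAC KBP '.txt' and '.key' files
     * and generates the NAF, CONLL and AIDA outputs.
     *
     * @param args the command line arguments
     */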
    public static void main(final String... args) {
        try {
            // Parse command line
            final CommandLine cmd = CommandLine.parser().withName("tackbp-converter-to-naf")
                    .withHeader(
                            "Generates input and gold NAFs for the TAC KBP corpus (2011 format)")
                    .withOption("t", "txt",
                            "the TAC KBP '.txt' file containing article texts "
                                    + "(e.g., tac2011test_docs.txt)",
                            "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
                    .withOption("k", "key",
                            "the TAC KBP '.key' file containing expected results "
                                    + "(e.g., tac2011test_wiki2011.key)",
                            "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
                    .withOption("n", "naf", "the FOLDER where to write NAFs", "FOLDER",
                            CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
                    .withOption("c", "conll",
                            "the FILE where to write the gold NERC data in the CONLL format",
                            "FILE", CommandLine.Type.FILE, true, false, false)
                    .withOption("a", "aida",
                            "the FILE where to write the gold EL data in the AIDA format", "FILE",
                            CommandLine.Type.FILE, true, false, false)
                    .withOption("u", "url-template", "URL template (with %s for the document ID)",
                            "URL", CommandLine.Type.STRING, true, false, false)
                    .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);

            // Read options
            final Path txtPath = cmd.getOptionValue("t", Path.class);
            final Path keyPath = cmd.getOptionValue("k", Path.class);
            final Path nafPath = cmd.getOptionValue("n", Path.class);
            final Path conllPath = cmd.getOptionValue("c", Path.class);
            final Path aidaPath = cmd.getOptionValue("a", Path.class);
            final String urlTemplate = cmd.getOptionValue("u", String.class, DEFAULT_URL);

            // Parse input .txt and .key files
            final List<Document> documents = parse(txtPath, keyPath);

            // Generate NAFs, CONLL and AIDA
            generate(nafPath, conllPath, aidaPath, documents, urlTemplate);

        } catch (final Throwable ex) {
            // Handle failure
            CommandLine.fail(ex);
        }
    }

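    /**
     * Parses the supplied '.txt' and '.key' files, returning the list of {@link Document}s they
     * describe, each associated to its {@link Query} entries. Inconsistencies between the two
     * files (missing, unreferenced or mismatching queries) are reported via warnings.
     *
     * @param txtPath the path of the '.txt' file with document texts, not null
     * @param keyPath the path of the '.key' file with expected results, not null
     * @return the parsed documents, sorted by document ID
     * @throws IOException on I/O error
     */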
    private static List<Document> parse(final Path txtPath, final Path keyPath)
            throws IOException {

        // Parse queries from the .key file
        final Multimap<String, Query> queries = HashMultimap.create();
        final Multiset<String> nercClasses = HashMultiset.create();
        int nilCount = 0;
        try (Reader in = IO.utf8Reader(IO.buffer(IO.read(keyPath.toString())))) {
            for (final String line : CharStreams.readLines(in)) {

                // Extract fields from current row
                final String[] fields = line.split("\t");
                final String docId = fields[0];
                final String queryId = fields[1];
                final String queryText = fields[3];
                final String expectedNelId = fields[4];
                final String expectedNercId = fields[5];

                // Create and index query
                final Query query = new Query(docId, queryId, queryText, expectedNercId,
                        expectedNelId);
                queries.put(docId, query);

                // Update statistics
                nercClasses.add(expectedNercId);
                nilCount += query.expectedNelUri == null ? 1 : 0;
            }
        }
        final int numQueries = queries.size();

        // Parse documents from the .txt file, associating them to corresponding queries
        final Map<String, Document> documents = Maps.newHashMap();
        try (Reader in = IO.utf8Reader(IO.buffer(IO.read(txtPath.toString())))) {
            for (final String line : CharStreams.readLines(in)) {

                // Extract fields from current row
                final String[] fields = line.split("\t");
                final String docId = fields[0];
                final String queryId = fields[1];
                final String queryText = fields[3];
                final String docText = fields[4];

                // Handle two cases to deal with documents having multiple queries
                Document document = documents.get(docId);
                if (document == null) {
                    // Create document when first encountered, associating it to all its queries
                    final Collection<Query> docQueries = queries.get(docId);
                    if (docQueries.isEmpty()) {
                        LOGGER.warn("No matching entries in .key file for document " + docId);
                    }
                    document = new Document(docId, docText, queries.get(docId));
                    documents.put(docId, document);

                } else {
                    // Verify that the document text is the same
                    if (!document.docXml.equals(docText)) {
                        LOGGER.warn("Different texts for document " + docId);
                    }
                }

                // Check there is a corresponding .key query entry for the current .txt row
                final Query matchingQuery = document.queries.stream()
                        .filter(q -> q.queryId.equals(queryId)).findFirst().orElse(null);
                if (matchingQuery == null) {
                    LOGGER.warn("No entry in .key file for document " + docId + " and query "
                            + queryId);
                } else if (!matchingQuery.queryText.equals(queryText)) {
                    LOGGER.warn("Different query texts in .txt and .key files for query " + queryId
                            + ": " + queryText + " - " + matchingQuery.queryText);
                }

                // Remove matching query, so that at the end we can detect unreferenced queries
                if (matchingQuery != null) {
                    queries.remove(docId, matchingQuery);
                }
            }
        }

        // Check there are no unreferenced queries in the .key file
        if (!queries.values().isEmpty()) {
            final StringBuilder builder = new StringBuilder(
                    "There are .key query entries not referenced in the .txt file:");
            for (final Query query : queries.values()) {
                builder.append(" ").append(query.docId).append(" ").append(query.queryId);
            }
            LOGGER.warn(builder.toString());
        }

        // Log parse results / statistics
        LOGGER.info("Parsed {} query entries for {} documents, {} NILs, {} NERC classes: {}",
                numQueries, documents.size(), nilCount, nercClasses.elementSet().size(),
                Joiner.on(", ").join(nercClasses.entrySet().stream()
                        .map(e -> e.getElement() + ":" + e.getCount()).toArray()));

        // Sort documents
        final List<Document> sortedDocuments = Lists.newArrayList(documents.values());
        sortedDocuments.sort((d1, d2) -> d1.docId.compareTo(d2.docId));

        // Return parsed documents
        return sortedDocuments;
    }

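    /**
     * Emits one NAF file per document in the folder specified, plus the gold NERC data in the
     * CONLL format and the gold EL data in the AIDA format in the (optional) files specified.
     *
     * @param nafPath the folder where to write NAF files, not null
     * @param conllPath the file where to write CONLL data, possibly null
     * @param aidaPath the file where to write AIDA data, possibly null
     * @param documents the documents to process, not null
     * @param urlTemplate the template used to mint document URIs, with %s for the document ID
     * @throws IOException on I/O error
     */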
    private static void generate(final Path nafPath, final Path conllPath, final Path aidaPath,
            final Iterable<Document> documents, final String urlTemplate) throws IOException {

        // Writers for CONLL and AIDA data
        Writer conllWriter = null;
        Writer aidaWriter = null;

        try {
            // Open CONLL and AIDA files for writing; these options are optional, so output is
            // discarded (via Guava's null writer) if a file was not specified
            conllWriter = conllPath == null ? CharStreams.nullWriter()
                    : IO.utf8Writer(IO.buffer(IO.write(conllPath.toString())));
            aidaWriter = aidaPath == null ? CharStreams.nullWriter()
                    : IO.utf8Writer(IO.buffer(IO.write(aidaPath.toString())));

            // Generate and emit one NAF at a time
            int nafCount = 0;
            for (final Document document : documents) {

                // Create NAF document
                final KAFDocument naf = new KAFDocument("en", "v3");

                // Set text
                naf.setRawText(Joiner.on("\n").join(document.docTokens.stream()
                        .map(l -> Joiner.on(" ").join(l)).collect(Collectors.toList())));

                // Set title, date, ID, source/type in the fileDesc structure
                final KAFDocument.FileDesc fileDesc = naf.createFileDesc();
                fileDesc.title = document.docTitle;
                fileDesc.creationtime = NAF_DATE_FORMAT.format(document.docDate);
                fileDesc.filename = document.docId;
                fileDesc.filetype = document.docSource + "/" + document.docType;

                // Set URI and ID in the public structure
                final KAFDocument.Public aPublic = naf.createPublic();
                aPublic.uri = String.format(urlTemplate, document.docId);
                aPublic.publicId = document.docId;

                // Write NAF to file
                final Path outFile = nafPath.resolve(document.docId + ".naf");
                naf.save(outFile.toFile());

                // Emit document start tags for both CONLL and AIDA files
                conllWriter.write("-DOCSTART- " + document.docId + " O O\n");
                aidaWriter.write("-DOCSTART- (" + document.docId + ")\n");

                // Iterate over text sentences in the document
                for (final List<String> sentence : document.docTokens) {

                    // Skip empty sentences
                    if (sentence.isEmpty()) {
                        continue;
                    }

                    // Locate mentions of queries inside the sentence tokens, making sure that
                    // candidate matches do not run past the end of the sentence
                    final Query[] mentions = new Query[sentence.size()];
                    for (final Query query : document.queries) {
                        final List<String> queryTokens = ImmutableList
                                .copyOf(Iterables.concat(tokenize(query.queryText)));
                        outer: for (int i = 0; i + queryTokens.size() <= sentence.size(); ++i) {
                            for (int j = 0; j < queryTokens.size(); ++j) {
                                if (mentions[i + j] != null || !sentence.get(i + j)
                                        .equalsIgnoreCase(queryTokens.get(j))) {
                                    continue outer;
                                }
                            }
                            for (int j = 0; j < queryTokens.size(); ++j) {
                                mentions[i + j] = query;
                            }
                        }
                    }

                    // Emit sentence to both CONLL and AIDA files
                    for (int i = 0; i < sentence.size(); ++i) {

                        // Determine NERC tag for current token ("B-" only when a mention starts
                        // right after another mention with the same NERC class)
                        String nercTag = "O";
                        if (mentions[i] != null) {
                            final boolean b = i > 0 && mentions[i - 1] != null
                                    && mentions[i - 1] != mentions[i]
                                    && mentions[i - 1].expectedNercClass
                                            .equals(mentions[i].expectedNercClass);
                            nercTag = (b ? "B-" : "I-")
                                    + mentions[i].expectedNercClass.toUpperCase();
                        }

                        // Determine EL tag for current token, reconstructing the full anchor
                        // text of the mention it belongs to
                        String elTag = null;
                        String elAnchor = null;
                        if (mentions[i] != null) {
                            final boolean b = i == 0 || mentions[i - 1] != mentions[i];
                            elTag = b ? "B" : "I";
                            elAnchor = sentence.get(i);
                            for (int j = i - 1; j >= 0 && mentions[j] == mentions[i]; --j) {
                                elAnchor = sentence.get(j) + " " + elAnchor;
                            }
                            for (int j = i + 1; j < sentence.size()
                                    && mentions[j] == mentions[i]; ++j) {
                                elAnchor = elAnchor + " " + sentence.get(j);
                            }
                        }

                        // Emit current token to both CONLL and AIDA files (AIDA output is
                        // capped at the first 58000 documents)
                        final String token = sentence.get(i);
                        conllWriter.write(token + " - - " + nercTag + "\n");
                        if (nafCount < 58000) {
                            aidaWriter.write(token + (elTag == null ? ""
                                    : "\t" + elTag + "\t" + elAnchor + "\t"
                                            + (mentions[i].expectedNelUri == null ? "--NME--"
                                                    : mentions[i].expectedNelId + "\t"
                                                            + mentions[i].expectedNelUri
                                                            + "\t0\t/m/x"))
                                    + "\n");
                        }
                    }

                    // Write empty line to separate sentences in both CONLL and AIDA files
                    conllWriter.write("\n");
                    aidaWriter.write("\n");
                }

                // Increase number of processed NAFs
                ++nafCount;
            }

            // Log results
            LOGGER.info("{} NAF files emitted in {}", nafCount, nafPath);

        } finally {
            // Close CONLL and AIDA output files
            IO.closeQuietly(conllWriter);
            IO.closeQuietly(aidaWriter);
        }
    }

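    /**
     * Tokenizes and sentence-splits the supplied string using the Stanford CoreNLP pipeline,
     * returning a list of sentences, each one a list of ASCII-normalized tokens.
     *
     * @param string the string to tokenize, not null
     * @return the resulting sentences and their tokens
     */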
    private static List<List<String>> tokenize(final String string) {

        // Tokenize
        final Annotation annotation = new Annotation(string);
        TOKENIZE_PIPELINE.annotate(annotation);

        // Convert to list of string tokens
        final List<List<String>> tokens = Lists.newArrayList();
        for (final CoreMap sentence : annotation.get(SentencesAnnotation.class)) {
            final List<String> sentenceTokens = Lists.newArrayList();
            tokens.add(sentenceTokens);
            for (final CoreLabel token : sentence.get(TokensAnnotation.class)) {
                final String text = ascii(token.originalText());
                // have to further split tokens embedding a space (e.g., a phone number) as there
                // is no way to encode them in the CONLL format, and if we try using a
                // non-breakable space then the AIDA evaluator will explode
                for (final String t : text.split("\\s+")) {
                    if (!Strings.isNullOrEmpty(t)) {
                        sentenceTokens.add(t);
                    }
                }
            }
        }
        return tokens;
    }

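    /**
     * Maps the supplied string to ASCII, replacing common non-ASCII characters (e.g., ©, ™)
     * with ASCII surrogates and dropping characters that cannot be mapped; a warning is logged
     * whenever the string is modified.
     *
     * @param string the string to normalize, not null
     * @return the normalized string, with XML entities unescaped
     */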
    private static String ascii(final String string) {
        final StringBuilder builder = new StringBuilder(string.length());
        for (int i = 0; i < string.length(); ++i) {
            final char ch = string.charAt(i);
            if (ch >= 32 && ch < 127) {
                builder.append(ch);
            } else if (ch == '©') {
                builder.append("(c)");
            } else if (ch == '™') {
                builder.append("(tm)");
            } else if (ch == '®') {
                builder.append("(r)");
            } else if (ch == '•' || ch == '·') {
                builder.append("*");
            } else if (ch == 'Ø') {
                builder.append("0");
            } else if (ch == '‑') {
                builder.append("-");
            } else if (ch == '´') {
                builder.append("'");
            } else if (ch == '¨') {
                builder.append("\"");
            } else if (ch == '¸') {
                builder.append(",");
            } else {
                // Decompose the character and keep only its ASCII components, if any
                final String s = Normalizer.normalize("" + ch, Normalizer.Form.NFD);
                for (final char c : s.toCharArray()) {
                    if (c <= '\u007F') {
                        builder.append(c);
                    } else {
                        // builder.append(' '); // TODO: uncomment
                    }
                }
            }
        }
        final String result = builder.toString();
        if (!result.equals(string)) {
            LOGGER.warn("Normalized {} to {}", string, result);
        }
        return StringEscapeUtils.unescapeXml(result);
    }

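    /**
     * A TAC KBP document, i.e., a row of the '.txt' file enriched with the metadata (date,
     * source, type, title) and tokenized text extracted from its XML payload, plus the queries
     * referring to it.
     */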
    private static final class Document {

        final String docId;

        final String docXml;

        final Date docDate;

        final String docSource;

        final String docType;

        final String docTitle;

        final List<List<String>> docTokens;

        final List<Query> queries;

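        /**
         * Creates a new document, parsing the supplied XML text and extracting metadata and
         * tokens from it.
         *
         * @param docId the document ID, not null
         * @param docXml the XML text of the document, not null
         * @param queries the queries associated to the document, not null
         */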
        public Document(final String docId, final String docXml, final Iterable<Query> queries) {

            // Check and store parameters
            this.docId = Objects.requireNonNull(docId);
            this.docXml = Objects.requireNonNull(docXml);
            this.queries = ImmutableList.copyOf(queries);

            try {
                // Parse XML
                final org.w3c.dom.Document document = DOCUMENT_BUILDER
                        .parse(new InputSource(new StringReader(docXml)));

                // Extract and check document ID embedded in the XML
                final NodeList docIdNodes = document.getElementsByTagName("DOCID");
                if (docIdNodes.getLength() == 1) {
                    final String parsedDocId = docIdNodes.item(0).getTextContent().trim();
                    if (!docId.equals(parsedDocId)) {
                        LOGGER.warn("DOCID XML element " + parsedDocId
                                + " does not match ID of document " + docId);
                    }
                }

                // Extract document date from the XML, falling back to the current date
                Date date = new Date();
                final NodeList dateNodes = document.getElementsByTagName("DATETIME");
                if (dateNodes.getLength() == 1) {
                    final String dateStr = dateNodes.item(0).getTextContent().trim();
                    try {
                        date = XML_DATE_FORMAT.parse(dateStr);
                    } catch (final Throwable ex) {
                        LOGGER.warn("Could not parse <DATETIME> value " + dateStr);
                    }
                } else {
                    LOGGER.warn("No <DATETIME> XML element for document " + docId);
                }
                this.docDate = date;

                // Extract document source and type from <DOCTYPE>
                final NodeList doctypeNodes = document.getElementsByTagName("DOCTYPE");
                if (doctypeNodes.getLength() == 1) {
                    final Element doctypeElement = (Element) doctypeNodes.item(0);
                    this.docSource = toNormalCase(doctypeElement.getAttribute("SOURCE").trim());
                    this.docType = toNormalCase(doctypeElement.getTextContent().trim());
                } else {
                    LOGGER.warn("No <DOCTYPE> XML element for document " + docId);
                    this.docSource = "";
                    this.docType = "";
                }

                // Extract document title from the XML, lowercasing all-uppercase headlines
                final NodeList headlineNodes = document.getElementsByTagName("HEADLINE");
                if (headlineNodes.getLength() == 1) {
                    this.docTitle = toNormalCase(headlineNodes.item(0).getTextContent().trim());
                } else {
                    this.docTitle = "";
                    LOGGER.warn("No <HEADLINE> XML element for document " + docId);
                }

                // Extract document text from the XML, concatenating the tokenized title with
                // the text collected under <TEXT> elements
                final List<List<String>> tokens = Lists.newArrayList();
                if (!this.docTitle.isEmpty()) {
                    tokens.addAll(tokenize(toSentence(this.docTitle)));
                }
                collectText(tokens, document.getElementsByTagName("TEXT"));
                this.docTokens = ImmutableList.copyOf(tokens);
                if (this.docTokens.size() <= 1) {
                    LOGGER.warn("No text extracted for document " + docId);
                }

            } catch (final Throwable ex) {
                Throwables.throwIfUnchecked(ex);
                throw new RuntimeException(ex);
            }
        }

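        /**
         * Recursively collects the tokenized text under the supplied XML nodes, appending the
         * resulting sentences to the supplied list with an empty sentence marking each
         * paragraph boundary.
         *
         * @param tokens the list where to accumulate sentences/tokens, not null
         * @param nodes the XML nodes to visit, not null
         */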
        private static void collectText(final List<List<String>> tokens, final NodeList nodes) {
            for (int i = 0; i < nodes.getLength(); ++i) {
                final Node node = nodes.item(i);
                if (node instanceof Text) {
                    final String text = node.getTextContent().trim();
                    if (!text.isEmpty()) {
                        final List<List<String>> paragraph = tokenize(text);
                        tokens.add(ImmutableList.of()); // empty list for paragraph separator
                        tokens.addAll(paragraph);
                    }
                } else {
                    collectText(tokens, node.getChildNodes());
                }
            }
        }

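        /**
         * Lowercases the supplied string if it is all uppercase, returning it unchanged
         * otherwise.
         *
         * @param string the string to normalize, not null
         * @return the normalized string
         */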
        private static String toNormalCase(final String string) {
            return string.toUpperCase().equals(string) ? string.toLowerCase() : string;
        }

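        /**
         * Formats the supplied string as a sentence, capitalizing its first letter and adding a
         * trailing period if missing.
         *
         * @param string the string to format, not null
         * @return the resulting sentence
         */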
        private static String toSentence(String string) {
            string = StringUtils.capitalize(string).trim();
            return string.endsWith(".") ? string : string + ".";
        }

    }

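    /**
     * A TAC KBP query, i.e., a row of the '.key' file identifying an entity mention in a
     * document together with its expected NERC class and entity linking result (a DBpedia URI,
     * or null for NIL entities).
     */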
    private static final class Query {

        private static final Pattern NIL_PATTERN = Pattern.compile("NIL[0-9]+");

        final String docId;

        final String queryId;

        final String queryText;

        final String expectedNercClass;

        final String expectedNelId;

        @Nullable
        final String expectedNelUri;

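        /**
         * Creates a new query. GPE NERC classes are mapped to LOC, and NIL entity IDs are
         * mapped to a null expected URI.
         *
         * @param docId the ID of the document the query refers to, not null
         * @param queryId the query ID, not null
         * @param queryText the query text (entity mention), not null
         * @param expectedNercClass the expected NERC class, not null
         * @param expectedNelId the expected entity ID, or NILxxx for NIL entities, not null
         */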
        public Query(final String docId, final String queryId, final String queryText,
                final String expectedNercClass, final String expectedNelId) {

            this.docId = Objects.requireNonNull(docId);
            this.queryId = Objects.requireNonNull(queryId);
            this.queryText = Objects.requireNonNull(queryText);
            this.expectedNercClass = Objects.requireNonNull(expectedNercClass).replace("GPE",
                    "LOC");
            this.expectedNelId = Objects.requireNonNull(expectedNelId);
            this.expectedNelUri = NIL_PATTERN.matcher(expectedNelId).matches() ? null
                    : "http://dbpedia.org/resource/" + expectedNelId;
        }

    }

}