package eu.fbk.dkm.pikes.tintop;

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Ordering;
import edu.cmu.cs.lti.ark.fn.parsing.SemaforParseResult;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.CollinsHeadFinder;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.IntPair;
import eu.fbk.dkm.pikes.resources.*;
import eu.fbk.dkm.pikes.resources.ontonotes.VerbNetStatisticsExtractor;
import eu.fbk.dkm.pikes.tintop.util.NER2SSTtagset;
import eu.fbk.dkm.pikes.tintop.util.NerEntity;
import eu.fbk.dkm.pikes.twm.LinkingTag;
import eu.fbk.dkm.pikes.twm.TWMAnnotations;
import eu.fbk.fcw.mate.MateAnnotations;
import eu.fbk.fcw.ner.NERConfidenceAnnotator;
import eu.fbk.fcw.semafor.Semafor;
import eu.fbk.fcw.semafor.SemaforAnnotations;
import eu.fbk.fcw.ukb.UKBAnnotations;
import eu.fbk.fcw.utils.AnnotatorUtils;
import eu.fbk.fcw.wnpos.WNPosAnnotations;
import eu.fbk.utils.core.PropertiesUtils;
import eu.fbk.utils.corenlp.CustomAnnotations;
import eu.fbk.utils.corenlp.outputters.JSONOutputter;
import ixa.kaflib.*;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import se.lth.cs.srl.corpus.Word;

import javax.annotation.Nullable;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.Map.Entry;

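/**
 * Main annotation pipeline of PIKES: it runs Stanford CoreNLP together with the
 * annotators configured through the properties (Mate SRL, Semafor, UKB, entity
 * linking, ...) and writes the result into the layers of a NAF document.
 *
 * <p>A minimal usage sketch (the config file name is illustrative):</p>
 *
 * <pre>
 * AnnotationPipeline pipeline = new AnnotationPipeline(new File("config-pikes.prop"), null);
 * pipeline.loadModels();
 * KAFDocument naf = pipeline.parseFromString(nafXml);
 * </pre>
 */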
public class AnnotationPipeline {

    static Logger logger = Logger.getLogger(AnnotationPipeline.class.getName());

    enum Models {ONTONOTES, WORDNET, PREDICATE_MATRIX}

    HashMap<Models, Boolean> modelsLoaded = new HashMap<>();

    private PredicateMatrix PM;
    private VerbNetStatisticsExtractor statisticsExtractor = null;

    private Properties defaultConfig = new Properties();

    private Map<String, String> nerMap = new HashMap<>();

    public AnnotationPipeline(@Nullable File configFile, @Nullable Properties additionalProperties) throws IOException {
        defaultConfig = new Properties();
        if (configFile != null) {
            // try-with-resources, so the stream is closed even if load() fails
            try (InputStream input = new FileInputStream(configFile)) {
                defaultConfig.load(input);
            }
        }
        defaultConfig.putAll(Defaults.classProperties());
        if (additionalProperties != null) {
            defaultConfig.putAll(additionalProperties);
        }
        Defaults.setNotPresent(defaultConfig);

        for (Models model : Models.values()) {
            modelsLoaded.put(model, false);
        }
    }

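    /**
     * Registers a rewrite for a NER label produced by CoreNLP (for instance,
     * mapping a custom model's tag to one of the labels handled below) before
     * the label is written to the NAF entity layer.
     */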
    public void addToNerMap(String key, String value) {
        nerMap.put(key, value);
    }

    public void deleteFromNerMap(String key) {
        nerMap.remove(key);
    }

    public Properties getDefaultConfig() {
        return defaultConfig;
    }

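    /**
     * Marks the syntactic head of every constituent in the tree by appending
     * {@code ixa.kaflib.Tree.HEAD_MARK} to its label, using Collins' head
     * rules when no {@link HeadFinder} is supplied.
     */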
    public static void addHeads(Tree node) {
        addHeads(node, null, null);
    }

    public static void addHeads(Tree node, Tree parent, HeadFinder headFinder) {
        if (node == null || node.isLeaf()) {
            return;
        }

        if (headFinder == null) {
            headFinder = new CollinsHeadFinder();
        }

        Tree head = headFinder.determineHead(node, parent);
        if (head != null && !head.isLeaf()) {
            head.label().setValue(head.label().toString() + ixa.kaflib.Tree.HEAD_MARK);
        }

        for (Tree child : node.children()) {
            addHeads(child, node, headFinder);
        }
    }

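    /**
     * Loads the optional resources enabled in the configuration (Predicate
     * Matrix, WordNet for the NAF filter, OntoNotes frequencies). Each model
     * is loaded at most once across repeated calls.
     */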
    public void loadModels() throws Exception {
        loadModels(getDefaultConfig());
    }

    public void loadModels(Properties properties) throws Exception {

        boolean enablePM = Defaults.getBoolean(properties.getProperty("enable_predicate_matrix"), false);
        boolean enableNafFilter = Defaults.getBoolean(properties.getProperty("enable_naf_filter"), false);
        boolean enableOntoNotesFilter = Defaults.getBoolean(properties.getProperty("enable_on_filter"), false);

        logger.info("Loading Stanford CoreNLP");

        // instantiating the pipeline here makes CoreNLP load (and pool) the
        // annotator models up front; the instance itself is not used afterwards
        Properties stanfordFromConfig = PropertiesUtils.dotConvertedProperties(properties, "stanford");
        new StanfordCoreNLP(stanfordFromConfig);

        if (enablePM && !modelsLoaded.get(Models.PREDICATE_MATRIX)) {
            logger.info("Loading Predicate Matrix");
            PM = new PredicateMatrix(properties.getProperty("predicate_matrix", Defaults.PREDICATE_MATRIX));
            modelsLoaded.put(Models.PREDICATE_MATRIX, true);
        }

        if (enableNafFilter && !modelsLoaded.get(Models.WORDNET)) {
            logger.info("Loading WordNet for NAF filter");
            WordNet.setPath(properties.getProperty("naf_filter_wordnet_path", Defaults.WN_DICT));
            WordNet.init();
            modelsLoaded.put(Models.WORDNET, true);
        }

        if (enableOntoNotesFilter && !modelsLoaded.get(Models.ONTONOTES)) {
            logger.info("Loading OntoNotes");
            statisticsExtractor = new VerbNetStatisticsExtractor();
            statisticsExtractor.loadFrequencies(properties.getProperty("on_frequencies", Defaults.ON_FREQUENCIES));
            modelsLoaded.put(Models.ONTONOTES, true);
        }
    }

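    /**
     * Converts a CoreNLP {@link Annotation} into NAF layers: tokens and terms,
     * dependencies, named entities and time expressions, Mate and Semafor
     * predicates with their roles, constituency trees, linked entities and
     * coreference chains. Optionally applies the NAF filter at the end.
     */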
    public void annotateStanford(Properties properties, Annotation document, KAFDocument NAFdocument)
            throws IOException {

        boolean enablePM = Defaults.getBoolean(properties.getProperty("enable_predicate_matrix"), false);
        boolean enableNafFilter = Defaults.getBoolean(properties.getProperty("enable_naf_filter"), false);
        boolean enableOntoNotesFilter = Defaults.getBoolean(properties.getProperty("enable_on_filter"), false);
        boolean enableEntityAssignment = Defaults.getBoolean(properties.getProperty("enable_entity_assignment"), false);

        Map<Integer, CorefChain> coreferenceGraph = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);

        try {
            Timex3 tmx0 = NAFdocument.newTimex3("tmx0", "DATE");
            tmx0.setValue(NAFdocument.getFileDesc().creationtime.substring(0, 10));
        } catch (Exception e) {
            logger.warn("Document creation time is not included in the NAF headers");
        }

        logger.info("Getting information");
        TreeMap<Integer, Integer> sentIndexes = new TreeMap<>();
        int totTokens = 0;
        ArrayList<Term> allTerms = new ArrayList<>();

        HashMap<Integer, Integer> tokenFromStart = new HashMap<>();
        HashMap<Integer, Integer> tokenFromEnd = new HashMap<>();

        HashMap<Integer, Integer> offsetToken = new HashMap<>();

        ArrayList<WF> allTokens = new ArrayList<>();
        HashMap<Integer, HashSet<LinkingTag>> keywords = new HashMap<>();

        if (document.containsKey(TWMAnnotations.LinkingAnnotations.class)) {
            for (LinkingTag e : document.get(TWMAnnotations.LinkingAnnotations.class)) {
                int start = e.getOffset();
                keywords.computeIfAbsent(start, k -> new HashSet<>()).add(e);
                logger.debug("Annotated entity (DS): " + e);
            }
        }

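        // First pass over each sentence: emit the token (WF) and term layers
        // and collect the BIO named-entity spans used below for the entity layer.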
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        for (int sentIndex = 0; sentIndex < sentences.size(); sentIndex++) {
            CoreMap stanfordSentence = sentences.get(sentIndex);
            List<CoreLabel> tokens = stanfordSentence.get(CoreAnnotations.TokensAnnotation.class);

            ArrayList<Term> terms = new ArrayList<>();
            ArrayList<String> ners = new ArrayList<>();

            sentIndexes.put(sentIndex, totTokens);
            totTokens += tokens.size();

            HashMap<Integer, TreeSet<Integer>> children = new HashMap<>();

            String lastNER = "O";
            ArrayList<NerEntity> entities = new ArrayList<>();

            for (int i = 0; i < tokens.size(); i++) {
                CoreLabel stanfordToken = tokens.get(i);
                String form = stanfordToken.get(CoreAnnotations.TextAnnotation.class);
                String lemma = stanfordToken.get(CoreAnnotations.LemmaAnnotation.class);
                String pos = stanfordToken.get(CoreAnnotations.PartOfSpeechAnnotation.class);

                form = AnnotatorUtils.codeToParenthesis(form);
                if (lemma != null) {
                    lemma = AnnotatorUtils.codeToParenthesis(lemma);
                }
                pos = AnnotatorUtils.codeToParenthesis(pos);

                children.put(i, new TreeSet<Integer>());

                WF thisWF = NAFdocument.newWF(form, stanfordToken.beginPosition(), sentIndex + 1);
                thisWF.setPara(1);

                Integer tokenID = totTokens - tokens.size() + i;

                tokenFromStart.put(stanfordToken.beginPosition(), tokenID);
                tokenFromEnd.put(stanfordToken.beginPosition() + thisWF.getLength(), tokenID);

                for (int j = stanfordToken.beginPosition(); j < stanfordToken.beginPosition() + thisWF.getLength(); j++) {
                    offsetToken.put(j, tokenID);
                }

                allTokens.add(tokenID, thisWF);

                Span<WF> thisWFSpan = KAFDocument.newWFSpan();
                thisWFSpan.addTarget(thisWF);
                Term thisTerm = NAFdocument.newTerm("open", lemma, pos, thisWFSpan);
                thisTerm.setMorphofeat(pos);

                String upos = stanfordToken.get(CustomAnnotations.UPosAnnotation.class);
                thisTerm.setUpos(upos);

                String wnSense = stanfordToken.get(UKBAnnotations.UKBAnnotation.class);
                if (wnSense != null) {
                    thisTerm.setWordnetSense(wnSense);
                }

                String simplePos = stanfordToken.get(WNPosAnnotations.WNPosAnnotation.class);
                if (simplePos == null) {
                    simplePos = "O";
                }
                thisTerm.setPos(simplePos);

                Map<String, Collection<String>> features = stanfordToken.get(CustomAnnotations.FeaturesAnnotation.class);
                thisTerm.setFeatures(features);

                terms.add(thisTerm);
                allTerms.add(thisTerm);

                String ne = stanfordToken.get(CoreAnnotations.NamedEntityTagAnnotation.class);
                if (nerMap.containsKey(ne)) {
                    ne = nerMap.get(ne);
                }
                String normVal = stanfordToken.getString(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
                if (ne != null) {
                    if (ne.equals("O")) {
                        ners.add("0");
                    } else {
                        String alt = NER2SSTtagset.tagset.get(ne);
                        if (alt == null) {
                            alt = "MISC";
                        }

                        if (ne.equals(lastNER)) {
                            entities.get(entities.size() - 1).setEndToken(i);
                            ners.add("I-" + alt);
                        } else {
                            NerEntity newEntity = new NerEntity(ne, i, normVal);
                            newEntity.setScoredLabels(stanfordToken.get(NERConfidenceAnnotator.ScoredNamedEntityTagsAnnotation.class));
                            entities.add(newEntity);
                            ners.add("B-" + alt);
                        }
                    }
                    lastNER = ne;
                } else {
                    ners.add("0");
                }
            }

            // Second pass: dependency layer; for Mate, also record for each
            // token the tokens it dominates (used later to build role spans)
            for (int i = 0; i < tokens.size(); i++) {
                CoreLabel stanfordToken = tokens.get(i);

                if (!stanfordToken.containsKey(CoreAnnotations.CoNLLDepParentIndexAnnotation.class)) {
                    continue;
                }

                // the parent index is incremented so that 0 marks the root and
                // can be skipped below
                int head = stanfordToken.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
                head++;
                String depRel = stanfordToken.get(CoreAnnotations.CoNLLDepTypeAnnotation.class);
                if (head != 0) {
                    Term from = terms.get(head - 1);
                    Term to = terms.get(i);
                    NAFdocument.newDep(from, to, depRel);
                }

                Word word = stanfordToken.get(MateAnnotations.MateTokenAnnotation.class);
                if (word != null) {
                    List<Word> toRoot = Word.pathToRoot(word);
                    for (Word w : toRoot) {
                        int id = w.getIdx() - 1;
                        if (id < 0) {
                            continue;
                        }
                        children.get(id).add(i);
                    }
                }
            }
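            // Map the collected NER spans to NAF entities (or Timex3 for
            // temporal expressions), attaching the normalized value and, when
            // entity assignment is enabled, the best-scoring link for the span.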
            for (NerEntity entity : entities) {

                List<WF> startWFs = terms.get(entity.getStartToken()).getWFs();
                List<WF> endWFs = terms.get(entity.getEndToken()).getWFs();
                int startIndex = startWFs.get(0).getOffset();
                int endIndex = endWFs.get(endWFs.size() - 1).getOffset() + endWFs.get(endWFs.size() - 1).getLength();

                logger.debug(String.format("Stanford NER entity: %s (from %d to %d)", entity.getLabel(), startIndex,
                        endIndex));

                Span<Term> thisTermSpan = KAFDocument.newTermSpan();
                Span<WF> thisWFSpan = KAFDocument.newWFSpan();

                for (int i = entity.getStartToken(); i <= entity.getEndToken(); i++) {
                    thisTermSpan.addTarget(terms.get(i));
                    thisWFSpan.addTargets(terms.get(i).getWFs());
                }

                List<Span<Term>> thisTermList = new LinkedList<>();
                List<Span<WF>> thisWFList = new LinkedList<>();

                thisTermList.add(thisTermSpan);
                thisWFList.add(thisWFSpan);

                Entity thisEntity = null;
                Timex3 thisTimex = null;

                entity.setLabel(entity.getLabel().toUpperCase());

                switch (entity.getLabel()) {
                case "PERSON":
                case "LOCATION":
                case "ORGANIZATION":
                case "MISC":
                case "MONEY":
                case "PERCENT":
                case "PER":
                case "LOC":
                case "ORG":
                    thisEntity = NAFdocument.newEntity(thisTermList);
                    String entityLabel = entity.getLabel()
                            .replace("PERSON", "PER").replace("ORGANIZATION", "ORG").replace("LOCATION", "LOC");
                    thisEntity.setType(entityLabel);

                    if (entity.getNormalizedValue() != null && entity.getNormalizedValue().length() > 0) {
                        thisEntity.addExternalRef(NAFdocument.createExternalRef("value", entity.getNormalizedValue()));
                    }

                    if (enableEntityAssignment) {
                        // pick the highest-scoring linked entity whose span matches
                        LinkingTag e = null;
                        HashSet<LinkingTag> possibleEntities = keywords.get(startIndex);
                        if (possibleEntities != null) {
                            for (LinkingTag loopEntity : possibleEntities) {
                                int end = loopEntity.getOffset() + loopEntity.getLength();
                                if (end != endIndex) {
                                    continue;
                                }
                                if (e == null || e.getScore() < loopEntity.getScore()) {
                                    e = loopEntity;
                                }
                            }
                        }

                        if (e != null) {
                            ExternalRef ext = NAFdocument.newExternalRef(e.getSource(), e.getPage());
                            ext.setConfidence((float) e.getScore());
                            thisEntity.addExternalRef(ext);
                        }
                    }

                    break;

                case "NUMBER":
                    thisEntity = NAFdocument.newEntity(thisTermList);
                    thisEntity.setType("CARDINAL");
                    thisEntity.addExternalRef(NAFdocument.createExternalRef("value", entity.getNormalizedValue()));
                    break;

                case "ORDINAL":
                    thisEntity = NAFdocument.newEntity(thisTermList);
                    thisEntity.setType("ORDINAL");
                    thisEntity.addExternalRef(NAFdocument.createExternalRef("value", entity.getNormalizedValue()));
                    break;

                case "DATE":
                case "TIME":
                case "DURATION":
                    thisTimex = NAFdocument.newTimex3(thisWFSpan, entity.getLabel());
                    thisTimex.setValue(entity.getNormalizedValue());
                    break;

                default:
                    logger.debug(entity.getLabel());
                }

                if (thisEntity != null && entity.getScoredLabels() != null) {
                    for (Entry<String, Double> entry : entity.getScoredLabels().entrySet()) {
                        ExternalRef ref = NAFdocument.createExternalRef("value-confidence",
                                entry.getKey().replace("PERSON", "PER").replace("ORGANIZATION", "ORG").replace("LOCATION", "LOC"));
                        ref.setConfidence(entry.getValue().floatValue());
                        thisEntity.addExternalRef(ref);
                    }
                }
            }

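            // Mate SRL: one NAF predicate per Mate predicate, with its PropBank
            // or NomBank sense and, when the Predicate Matrix is enabled, the
            // mapped VerbNet / FrameNet / ESO / WordNet senses and role labels.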
            for (int i = 0; i < tokens.size(); i++) {
                CoreLabel stanfordToken = tokens.get(i);

                se.lth.cs.srl.corpus.Predicate predicate = stanfordToken.get(MateAnnotations.MateAnnotation.class);
                if (predicate != null) {
                    Span<Term> thisTermSpan = KAFDocument.newTermSpan();
                    Term thisTerm = terms.get(predicate.getIdx() - 1);
                    String mateSense = predicate.getSense();

                    thisTermSpan.addTarget(thisTerm);

                    Predicate newPred = NAFdocument.newPredicate(thisTermSpan);
                    newPred.setSource("mate");

                    boolean verb = true;
                    String sense = null;

                    ExternalRef e;

                    if (thisTerm.getPos().equals("V")) {
                        e = NAFdocument.newExternalRef("PropBank", mateSense);
                        e.setSource("mate");
                        sense = mateSense;
                    } else {
                        verb = false;
                        e = NAFdocument.newExternalRef("NomBank", mateSense);
                        e.setSource("mate");

                        NomBank.Roleset roleset = NomBank.getRoleset(mateSense);
                        try {
                            sense = roleset.getPBId();
                        } catch (Exception ex) {
                            logger.debug(ex.getMessage());
                        }
                    }
                    newPred.addExternalRef(e);

                    ArrayList<String> vnClasses = new ArrayList<>();
                    ArrayList<String> fnFrames = new ArrayList<>();

                    if (enablePM) {
                        if (sense != null && sense.length() > 0) {

                            HashSet<String> vnToAdd = new HashSet<>();
                            String vnFinal = null;

                            // VerbNet classes; with the OntoNotes filter enabled,
                            // keep only the most frequent one
                            vnClasses = PM.getVNClasses(sense);
                            if (!vnClasses.isEmpty()) {
                                if (vnClasses.size() == 1 || !enableOntoNotesFilter) {
                                    for (String vnClass1 : vnClasses) {
                                        vnToAdd.add(vnClass1);
                                        vnFinal = vnClass1;
                                    }
                                } else {
                                    Integer value = 0;

                                    for (String vnClass : vnClasses) {
                                        Integer thisValue = statisticsExtractor.getVnTotals().get(vnClass);
                                        thisValue = thisValue == null ? 0 : thisValue;
                                        if (thisValue >= value) {
                                            vnFinal = vnClass;
                                            value = thisValue;
                                        }
                                    }

                                    vnClasses = new ArrayList<>();

                                    if (vnFinal != null) {
                                        vnToAdd.add(vnFinal);
                                        vnClasses.add(vnFinal);
                                    }
                                }
                            }

                            ArrayList<String> vnSubClasses = PM.getVNSubClasses(sense);
                            if (!vnSubClasses.isEmpty()) {
                                for (String vnSubClass1 : vnSubClasses) {
                                    for (String vnClass : vnClasses) {
                                        if (!vnSubClass1.startsWith(vnClass)) {
                                            continue;
                                        }

                                        vnToAdd.add(vnSubClass1);

                                        // prefer the subclass over its parent class
                                        if (vnFinal != null) {
                                            if (vnSubClass1.startsWith(vnFinal)) {
                                                vnToAdd.remove(vnFinal);
                                            }
                                        }
                                    }
                                }
                            }

                            for (String vnClass1 : vnToAdd) {
                                ExternalRef vnClass = NAFdocument.newExternalRef("VerbNet", vnClass1);
                                vnClass.setSource("mate+pm");
                                newPred.addExternalRef(vnClass);
                            }

                            vnClasses.clear();
                            vnClasses.addAll(vnToAdd);

                            fnFrames = PM.getFNFrames(sense);

                            if (enableOntoNotesFilter) {
                                // keep only the frames reachable from the selected
                                // VerbNet classes
                                HashSet<String> possibleFrames = new HashSet<>();
                                for (String vnClass : vnClasses) {
                                    possibleFrames.addAll(PM.getVNClassesToFN(vnClass));
                                }

                                fnFrames.retainAll(possibleFrames);
                            }

                            if (!fnFrames.isEmpty()) {
                                if (fnFrames.size() == 1 || !enableOntoNotesFilter) {
                                    for (String fnFrame1 : fnFrames) {
                                        ExternalRef fnFrame = NAFdocument.newExternalRef("FrameNet", fnFrame1);
                                        fnFrame.setSource("mate+pm");
                                        newPred.addExternalRef(fnFrame);
                                    }
                                } else {
                                    Integer value = 0;
                                    String fnFinal = null;

                                    for (String fnFrame : fnFrames) {
                                        Integer thisValue = statisticsExtractor.getFnTotals()
                                                .get(fnFrame.toLowerCase());
                                        thisValue = thisValue == null ? 0 : thisValue;
                                        if (thisValue >= value) {
                                            fnFinal = fnFrame;
                                            value = thisValue;
                                        }
                                    }

                                    fnFrames = new ArrayList<>();

                                    if (fnFinal != null) {
                                        ExternalRef fnFrame = NAFdocument.newExternalRef("FrameNet", fnFinal);
                                        fnFrame.setSource("mate+pm");
                                        newPred.addExternalRef(fnFrame);
                                        fnFrames.add(fnFinal);
                                    }
                                }
                            }

                            if (!verb) {
                                // map the NomBank sense back to its PropBank predicate(s)
                                ArrayList<String> pbPredicates = PM.getPBPredicates(sense);
                                if (!pbPredicates.isEmpty()) {
                                    for (String pbPredicate1 : pbPredicates) {
                                        ExternalRef pbPredicate = NAFdocument.newExternalRef("PropBank", pbPredicate1);
                                        pbPredicate.setSource("mate+nb");
                                        newPred.addExternalRef(pbPredicate);
                                    }
                                }
                            }

                            ArrayList<String> esoClasses = PM.getESOClasses(sense);
                            if (!esoClasses.isEmpty()) {
                                for (String esoClass1 : esoClasses) {
                                    ExternalRef esoClass = NAFdocument.newExternalRef("ESO", esoClass1);
                                    esoClass.setSource("mate+pm");
                                    newPred.addExternalRef(esoClass);
                                }
                            }

                            ArrayList<String> wnSenses = PM.getWNSenses(sense);
                            if (!wnSenses.isEmpty()) {
                                for (String wnSense1 : wnSenses) {
                                    ExternalRef wnSense = NAFdocument.newExternalRef("WordNet", wnSense1);
                                    wnSense.setSource("mate+pm");
                                    newPred.addExternalRef(wnSense);
                                }
                            }
                        }
                    }

                    for (Word w : predicate.getArgMap().keySet()) {
                        Span<Term> thisTermSpanForRole = KAFDocument.newTermSpan();
                        for (int k : children.get(w.getIdx() - 1)) {
                            thisTermSpanForRole.addTarget(terms.get(k));
                        }
                        thisTermSpanForRole.setHead(terms.get(w.getIdx() - 1));

                        String argument = predicate.getArgMap().get(w);
                        Predicate.Role newRole = NAFdocument.newRole(newPred, argument, thisTermSpanForRole);
                        ExternalRef mateRoleRef;
                        if (verb) {
                            mateRoleRef = NAFdocument.newExternalRef("PropBank", mateSense + "@" + argument);
                        } else {
                            mateRoleRef = NAFdocument.newExternalRef("NomBank", mateSense + "@" + argument);
                        }
                        mateRoleRef.setSource("mate");

                        newRole.addExternalRef(mateRoleRef);

                        if (enablePM && PM != null && statisticsExtractor != null) {

                            ArrayList<String> vnThematicRoles = PM.getVNThematicRoles(sense + "@" + argument);
                            if (!vnThematicRoles.isEmpty()) {
                                for (String vnThematicRole1 : vnThematicRoles) {
                                    if (!enableOntoNotesFilter) {
                                        ExternalRef vnThematicRole = NAFdocument
                                                .newExternalRef("VerbNet", vnThematicRole1);
                                        vnThematicRole.setSource("mate+pm");
                                        newRole.addExternalRef(vnThematicRole);
                                    } else {
                                        String[] parts = vnThematicRole1.split("@");
                                        if (vnClasses.contains(parts[0])) {
                                            ExternalRef vnThematicRole = NAFdocument
                                                    .newExternalRef("VerbNet", vnThematicRole1);
                                            vnThematicRole.setSource("mate+pm");
                                            newRole.addExternalRef(vnThematicRole);
                                        }
                                    }
                                }
                            }

                            ArrayList<String> fnFrameElements = PM.getFNFrameElements(sense + "@" + argument);
                            if (!fnFrameElements.isEmpty()) {
                                for (String fnFrameElement1 : fnFrameElements) {
                                    if (!enableOntoNotesFilter) {
                                        ExternalRef fnFrameElement = NAFdocument
                                                .newExternalRef("FrameNet", fnFrameElement1);
                                        fnFrameElement.setSource("mate+pm");
                                        newRole.addExternalRef(fnFrameElement);
                                    } else {
                                        String[] parts = fnFrameElement1.split("@");
                                        if (fnFrames.contains(parts[0])) {
                                            ExternalRef fnFrameElement = NAFdocument
                                                    .newExternalRef("FrameNet", fnFrameElement1);
                                            fnFrameElement.setSource("mate+pm");
                                            newRole.addExternalRef(fnFrameElement);
                                        }
                                    }
                                }
                            }

                            if (!verb) {
                                ArrayList<String> pbArguments = PM.getPBArguments(sense + "@" + argument);
                                if (!pbArguments.isEmpty()) {
                                    for (String pbArgument1 : pbArguments) {
                                        ExternalRef pbArgument = NAFdocument.newExternalRef("PropBank", pbArgument1);
                                        pbArgument.setSource("mate+pm");
                                        newRole.addExternalRef(pbArgument);
                                    }
                                }
                            }

                            ArrayList<String> esoRoles = PM.getESORoles(sense + "@" + argument);
                            if (!esoRoles.isEmpty()) {
                                for (String esoRole1 : esoRoles) {
                                    ExternalRef esoRole = NAFdocument.newExternalRef("ESO", esoRole1);
                                    esoRole.setSource("mate+pm");
                                    newRole.addExternalRef(esoRole);
                                }
                            }
                        }

                        newPred.addRole(newRole);
                    }
                }
            }

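            // Semafor: re-read the parse result through Jackson and add one
            // FrameNet predicate (id prefixed with "f_") per annotated frame,
            // with one role per frame element.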
            if (stanfordSentence.containsKey(SemaforAnnotations.SemaforAnnotation.class)) {
                SemaforParseResult semaforParseResult = stanfordSentence.get(SemaforAnnotations.SemaforAnnotation.class);
                ObjectMapper mapper = new ObjectMapper();
                mapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);
                Semafor.SemaforResponse semaforResponse = mapper
                        .readValue(semaforParseResult.toJson(), Semafor.SemaforResponse.class);
                for (Semafor.SemaforFrame semaforFrame : semaforResponse.getFrames()) {
                    Semafor.SemaforAnnotation semaforTarget = semaforFrame.getTarget();
                    if (semaforTarget == null) {
                        continue;
                    }
                    String frameName = semaforTarget.getName();

                    if (semaforTarget.getSpans().isEmpty()) {
                        continue;
                    }
                    if (semaforFrame.getAnnotationSets().isEmpty()) {
                        continue;
                    }

                    Semafor.SemaforSpan semaforSpan = semaforTarget.getSpans().get(0);
                    Semafor.SemaforSet semaforAnnotation = semaforFrame.getAnnotationSets().get(0);

                    Span<Term> termSpan = KAFDocument.newTermSpan();
                    for (int i = semaforSpan.getStart(); i < semaforSpan.getEnd(); i++) {
                        termSpan.addTarget(terms.get(i));
                    }

                    if (termSpan.size() == 0) {
                        continue;
                    }

                    Predicate predicate = NAFdocument.newPredicate(termSpan);
                    predicate.setSource("semafor");
                    predicate.setConfidence(semaforAnnotation.getScore());
                    ExternalRef frameNameExt = NAFdocument.createExternalRef("FrameNet", frameName);
                    frameNameExt.setSource("semafor");
                    predicate.addExternalRef(frameNameExt);

                    predicate.setId("f_" + predicate.getId());

                    for (Semafor.SemaforAnnotation frameAnnotation : semaforAnnotation.getFrameElements()) {
                        Semafor.SemaforSpan roleSpan = frameAnnotation.getSpans().get(0);
                        String roleName = frameAnnotation.getName();

                        Span<Term> roleTermSpan = KAFDocument.newTermSpan();
                        for (int i = roleSpan.getStart(); i < roleSpan.getEnd(); i++) {
                            roleTermSpan.addTarget(terms.get(i));
                        }

                        if (roleTermSpan.size() == 0) {
                            continue;
                        }

                        Predicate.Role role = NAFdocument.newRole(predicate, "", roleTermSpan);

                        // replace the role span with the terms dominated by its
                        // syntactic head, when one can be extracted
                        final Term head = NAFUtils.extractHead(NAFdocument, role.getSpan());
                        if (head != null) {
                            final Span<Term> newSpan = KAFDocument
                                    .newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(
                                            NAFdocument.getTermsByDepAncestors(ImmutableList.of(head))));
                            role.setSpan(newSpan);
                        }
                        ExternalRef roleNameExt = NAFdocument.createExternalRef("FrameNet", frameName + "@" + roleName);
                        roleNameExt.setSource("semafor");
                        role.addExternalRef(roleNameExt);

                        predicate.addRole(role);
                    }
                }
            }

            // Constituency: add the parse tree both as-is and with head marks
            Tree tree = stanfordSentence.get(TreeCoreAnnotations.TreeAnnotation.class);
            if (tree != null) {
                NAFdocument.addConstituencyString(tree.toString(), sentIndex + 1);
                try {
                    logger.debug("Tree: " + tree.toString());
                    addHeads(tree);
                    NAFdocument.addConstituencyFromParentheses(tree.toString(), sentIndex + 1);
                } catch (Exception e) {
                    logger.info("Tree: " + tree.toString());
                    logger.warn(e.getMessage(), e);
                }
            }

        }

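        // Entity linking: turn the keyword annotations collected above into the
        // NAF linkedEntities layer, resolving character offsets back to tokens.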
        for (Integer startIndex : keywords.keySet()) {
            for (LinkingTag e : keywords.get(startIndex)) {

                int end = e.getOffset() + e.getLength() - 1;
                Integer startToken = offsetToken.get(e.getOffset());
                Integer endToken = offsetToken.get(end);
                Span<WF> span = KAFDocument.newWFSpan();
                if (startToken != null && endToken != null) {
                    for (int j = startToken; j <= endToken; j++) {
                        span.addTarget(allTokens.get(j));
                    }

                    try {
                        LinkedEntity linkedEntity = NAFdocument.newLinkedEntity(span);
                        linkedEntity.setConfidence(e.getScore());
                        linkedEntity.setReference(e.getPage());
                        linkedEntity.setResource(e.getSource());
                        linkedEntity.setTypes(e.getStringTypes());
                        linkedEntity.setSpotted(e.isSpotted());
                    } catch (Exception err) {
                        logger.error("Error on adding linkedEntity: " + err.getMessage());
                    }
                }
            }
        }

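        // Simple (rule-based) coreference: group each sentence's token IDs into
        // contiguous term spans and emit a single coreference set over them.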
        HashMultimap<Integer, Integer> simpleCoref = document.get(CustomAnnotations.SimpleCorefAnnotation.class);
        if (simpleCoref != null) {
            List<Span<Term>> mentions = new ArrayList<>();

            for (Integer sentenceID : simpleCoref.keySet()) {
                TreeSet<Integer> sortedSet = new TreeSet<>();
                sortedSet.addAll(simpleCoref.get(sentenceID));

                Span<Term> thisTermSpan = KAFDocument.newTermSpan();
                int lastTokenID = -1;

                for (Integer tokenID : sortedSet) {
                    tokenID = tokenID - 1;
                    int sentenceStartTokenIndex = sentIndexes.get(sentenceID);
                    int id = sentenceStartTokenIndex + tokenID;
                    if (tokenID - lastTokenID > 1) {
                        if (thisTermSpan.size() > 0) {
                            mentions.add(thisTermSpan);
                        }
                        thisTermSpan = KAFDocument.newTermSpan();
                    }
                    thisTermSpan.addTarget(allTerms.get(id));
                    lastTokenID = tokenID;
                }
                if (thisTermSpan.size() > 0) {
                    mentions.add(thisTermSpan);
                }
            }

            if (mentions.size() > 0) {
                NAFdocument.newCoref(mentions);
            }
        }

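        // Stanford coreference: translate each CorefChain mention into a term
        // span, skipping mention sets with fewer than two mentions.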
        if (coreferenceGraph != null) {
            for (Integer c : coreferenceGraph.keySet()) {

                CorefChain chain = coreferenceGraph.get(c);
                Map<IntPair, Set<CorefChain.CorefMention>> mentionMap = chain.getMentionMap();

                List<Span<Term>> mentions = new ArrayList<>();

                for (IntPair p : mentionMap.keySet()) {

                    Set<CorefChain.CorefMention> corefMentions = mentionMap.get(p);
                    if (corefMentions.size() < 2) {
                        continue;
                    }

                    for (CorefChain.CorefMention m : corefMentions) {

                        // sentNum and token indexes are 1-based in CoreNLP
                        int sentenceStartTokenIndex = sentIndexes.get(m.sentNum - 1);
                        int start = sentenceStartTokenIndex + m.startIndex - 1;

                        Span<Term> thisTermSpan = KAFDocument.newTermSpan();
                        for (int i = start; i < start + m.endIndex - m.startIndex; i++) {
                            thisTermSpan.addTarget(allTerms.get(i));
                        }

                        if (thisTermSpan.size() > 0) {
                            mentions.add(thisTermSpan);
                        }
                    }
                }

                if (mentions.size() > 0) {
                    NAFdocument.newCoref(mentions);
                }
            }
        }

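        // Optional NAF filter post-processing, recorded as a linguistic
        // processor in the NAF header.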
        if (enableNafFilter) {
            logger.info("Applying NAF filter");

            LinguisticProcessor linguisticProcessor = new LinguisticProcessor("naf-filter", "NAF filter");
            linguisticProcessor.setBeginTimestamp();
            try {
                NAFFilter filter = NAFFilter.builder().withProperties(properties, "filter").build();
                filter.filter(NAFdocument);
            } catch (Exception e) {
                logger.error("Error applying NAF filter", e);
            }
            linguisticProcessor.setEndTimestamp();
            NAFdocument.addLinguisticProcessor(linguisticProcessor.getLayer(), linguisticProcessor);
        }
    }

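    /**
     * Runs the whole pipeline on the raw text of the given NAF document, using
     * the default configuration merged with the given properties.
     */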
    private KAFDocument parseAll(KAFDocument NAFdocument) throws Exception {
        return parseAll(NAFdocument, new Properties());
    }

    private KAFDocument parseAll(KAFDocument NAFdocument, Properties merge) throws Exception {

        String text = NAFdocument.getRawText();
        text = StringEscapeUtils.unescapeHtml(text);

        Properties properties = getDefaultConfig();
        properties.putAll(merge);

        String maxTextLen = properties.getProperty("max_text_len");
        int limit = Integer.parseInt(maxTextLen);
        if (text.length() > limit) {
            throw new Exception(String.format("Input too long (%d chars, limit is %d)", text.length(), limit));
        }

        loadModels(properties);
        Properties stanfordConfig = PropertiesUtils.dotConvertedProperties(properties, "stanford");
        StanfordCoreNLP thisPipeline = new StanfordCoreNLP(stanfordConfig);

        logger.info("Annotating with Stanford CoreNLP");
        LinguisticProcessor linguisticProcessor = new LinguisticProcessor("text", "Stanford CoreNLP");
        linguisticProcessor.setBeginTimestamp();
        Annotation document = new Annotation(text);
        document.set(CoreAnnotations.DocDateAnnotation.class, NAFdocument.getFileDesc().creationtime);
        if (NAFdocument.getFileDesc().title != null) {
            document.set(CoreAnnotations.DocTitleAnnotation.class, NAFdocument.getFileDesc().title);
        }
        thisPipeline.annotate(document);
        logger.info(thisPipeline.timingInformation());
        linguisticProcessor.setEndTimestamp();
        NAFdocument.addLinguisticProcessor(linguisticProcessor.getLayer(), linguisticProcessor);

        annotateStanford(properties, document, NAFdocument);

        logger.info("Parsing finished");
        return NAFdocument;
    }

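    /**
     * Annotates an already-parsed NAF document using the default configuration.
     */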
    public KAFDocument parseFromNAF(KAFDocument NAFdocument) throws Exception {
        return parseAll(NAFdocument);
    }

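    /**
     * Parses a NAF document given as an XML string and annotates it. A minimal
     * sketch (the file name is illustrative; the raw text and creation time
     * are expected in the NAF header):
     *
     * <pre>
     * String naf = new String(Files.readAllBytes(Paths.get("doc.naf")), StandardCharsets.UTF_8);
     * KAFDocument annotated = pipeline.parseFromString(naf);
     * </pre>
     */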
    public KAFDocument parseFromString(String textInNafFormat) throws Exception {
        logger.debug("Parsing of NAF");

        // decode explicitly as UTF-8 instead of relying on the platform charset
        InputStream is = new ByteArrayInputStream(textInNafFormat.getBytes(StandardCharsets.UTF_8));
        BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
        KAFDocument NAFdocument = KAFDocument.createFromStream(br);

        try {
            logger.info("Document: " + NAFdocument.getFileDesc().filename);
            logger.info("Title: " + NAFdocument.getFileDesc().title);
        } catch (Exception e) {
            logger.error(e.getMessage());
        }

        return parseAll(NAFdocument);
    }

}