1   package eu.fbk.dkm.pikes.tintop;
2   
3   import com.fasterxml.jackson.databind.DeserializationFeature;
4   import com.fasterxml.jackson.databind.ObjectMapper;
5   import com.google.common.collect.HashMultimap;
6   import com.google.common.collect.ImmutableList;
7   import com.google.common.collect.Ordering;
8   import edu.cmu.cs.lti.ark.fn.parsing.SemaforParseResult;
9   import edu.stanford.nlp.coref.CorefCoreAnnotations;
10  import edu.stanford.nlp.coref.data.CorefChain;
11  import edu.stanford.nlp.ling.CoreAnnotations;
12  import edu.stanford.nlp.ling.CoreLabel;
13  import edu.stanford.nlp.pipeline.Annotation;
14  import edu.stanford.nlp.pipeline.StanfordCoreNLP;
15  import edu.stanford.nlp.trees.CollinsHeadFinder;
16  import edu.stanford.nlp.trees.HeadFinder;
17  import edu.stanford.nlp.trees.Tree;
18  import edu.stanford.nlp.trees.TreeCoreAnnotations;
19  import edu.stanford.nlp.util.CoreMap;
20  import edu.stanford.nlp.util.IntPair;
21  import eu.fbk.dkm.pikes.resources.*;
22  import eu.fbk.dkm.pikes.resources.ontonotes.VerbNetStatisticsExtractor;
23  import eu.fbk.dkm.pikes.tintop.util.NER2SSTtagset;
24  import eu.fbk.dkm.pikes.tintop.util.NerEntity;
25  import eu.fbk.dkm.pikes.twm.LinkingTag;
26  import eu.fbk.dkm.pikes.twm.TWMAnnotations;
27  import eu.fbk.fcw.mate.MateAnnotations;
28  import eu.fbk.fcw.ner.NERConfidenceAnnotator;
29  import eu.fbk.fcw.semafor.Semafor;
30  import eu.fbk.fcw.semafor.SemaforAnnotations;
31  import eu.fbk.fcw.ukb.UKBAnnotations;
32  import eu.fbk.fcw.utils.AnnotatorUtils;
33  import eu.fbk.fcw.wnpos.WNPosAnnotations;
34  import eu.fbk.utils.core.PropertiesUtils;
35  import eu.fbk.utils.corenlp.CustomAnnotations;
36  import eu.fbk.utils.corenlp.outputters.JSONOutputter;
37  import ixa.kaflib.*;
38  import org.apache.commons.lang.StringEscapeUtils;
39  import org.apache.log4j.Logger;
40  import se.lth.cs.srl.corpus.Word;
41  
42  import javax.annotation.Nullable;
43  import java.io.*;
44  import java.util.*;
45  import java.util.Map.Entry;
46  
47  /**
48   * Created with IntelliJ IDEA.
49   * User: alessio
50   * Date: 21/07/14
51   * Time: 12:48
52   * To change this template use File | Settings | File Templates.
53   */
54  
55  public class AnnotationPipeline {
56  
57      static Logger logger = Logger.getLogger(AnnotationPipeline.class.getName());
58  
59      enum Models {ONTONOTES, WORDNET, PREDICATE_MATRIX}
60  
61      HashMap<Models, Boolean> modelsLoaded = new HashMap<>();
62  
63      private PredicateMatrix PM;
64      private VerbNetStatisticsExtractor statisticsExtractor = null;
65  
66      private Properties defaultConfig = new Properties();
67  
68      private Map<String, String> nerMap = new HashMap<>();
69  
70      public AnnotationPipeline(@Nullable File configFile, @Nullable Properties additionalProperties) throws IOException {
71          defaultConfig = new Properties();
72          if (configFile != null) {
73              InputStream input = new FileInputStream(configFile);
74              defaultConfig.load(input);
75              input.close();
76          }
77          defaultConfig.putAll(Defaults.classProperties());
78          if (additionalProperties != null) {
79              defaultConfig.putAll(additionalProperties);
80          }
81          Defaults.setNotPresent(defaultConfig);
82  
83          for (Models model : Models.values()) {
84              modelsLoaded.put(model, false);
85          }
86      }
87  
88      public void addToNerMap(String key, String value) {
89          nerMap.put(key, value);
90      }
91  
92      public void deleteFromNerMap(String key) {
93          nerMap.remove(key);
94      }
95  
96      public Properties getDefaultConfig() {
97          return defaultConfig;
98      }
99  
100     public static void addHeads(Tree node) {
101         addHeads(node, null, null);
102     }
103 
104     public static void addHeads(Tree node, Tree parent, HeadFinder headFinder) {
105         if (node == null || node.isLeaf()) {
106             return;
107         }
108 
109         if (headFinder == null) {
110             headFinder = new CollinsHeadFinder();
111         }
112 
113         Tree head = headFinder.determineHead(node, parent);
114         if (!head.isLeaf()) {
115             head.label().setValue(head.label().toString() + ixa.kaflib.Tree.HEAD_MARK);
116         }
117 
118         for (Tree child : node.children()) {
119             addHeads(child, node, headFinder);
120         }
121 
122     }
123 
124     public void loadModels() throws Exception {
125         loadModels(getDefaultConfig());
126     }
127 
128     public void loadModels(Properties properties) throws Exception {
129 
130         boolean enablePM = Defaults.getBoolean(properties.getProperty("enable_predicate_matrix"), false);
131         boolean enableNafFilter = Defaults.getBoolean(properties.getProperty("enable_naf_filter"), false);
132         boolean enableOntoNotesFilter = Defaults.getBoolean(properties.getProperty("enable_on_filter"), false);
133 
134         logger.info("Loading Stanford CoreNLP");
135 
136         Properties stanfordFromConfig = PropertiesUtils.dotConvertedProperties(properties, "stanford");
137         StanfordCoreNLP stanfordPipeline = new StanfordCoreNLP(stanfordFromConfig);
138 
139         // Predicate Matrix
140 
141         if (enablePM && !modelsLoaded.get(Models.PREDICATE_MATRIX)) {
142             logger.info("Loading Predicate Matrix");
143             PM = new PredicateMatrix(properties.getProperty("predicate_matrix", Defaults.PREDICATE_MATRIX));
144             modelsLoaded.put(Models.PREDICATE_MATRIX, true);
145         }
146 
147         // NAF filter
148 
149         if (enableNafFilter && !modelsLoaded.get(Models.WORDNET)) {
150             logger.info("Loading WordNet for NAF filter");
151             WordNet.setPath(properties.getProperty("naf_filter_wordnet_path", Defaults.WN_DICT));
152             WordNet.init();
153             modelsLoaded.put(Models.WORDNET, true);
154         }
155 
156         // OntoNotes
157 
158         if (enableOntoNotesFilter && !modelsLoaded.get(Models.ONTONOTES)) {
159             logger.info("Loading OntoNotes");
160             statisticsExtractor = new VerbNetStatisticsExtractor();
161 //			statisticsExtractor.loadDir(config.getProperty("on_folder"));
162 //			statisticsExtractor.loadFrequencies();
163             statisticsExtractor.loadFrequencies(properties.getProperty("on_frequencies", Defaults.ON_FREQUENCIES));
164             modelsLoaded.put(Models.ONTONOTES, true);
165         }
166     }
167 
168     public void annotateStanford(Properties properties, Annotation document, KAFDocument NAFdocument)
169             throws IOException {
170 
171         boolean enablePM = Defaults.getBoolean(properties.getProperty("enable_predicate_matrix"), false);
172         boolean enableNafFilter = Defaults.getBoolean(properties.getProperty("enable_naf_filter"), false);
173         boolean enableOntoNotesFilter = Defaults.getBoolean(properties.getProperty("enable_on_filter"), false);
174         boolean enableEntityAssignment = Defaults.getBoolean(properties.getProperty("enable_entity_assignment"), false);
175 
176         Map<Integer, CorefChain> coreferenceGraph = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
177 
178         // Add tmx0
179         try {
180             Timex3 tmx0 = NAFdocument.newTimex3("tmx0", "DATE");
181             tmx0.setValue(NAFdocument.getFileDesc().creationtime.substring(0, 10));
182         } catch (Exception e) {
183             logger.warn("Document creation time is not included in the NAF headers");
184         }
185 
186         logger.info("Getting information");
187         TreeMap<Integer, Integer> sentIndexes = new TreeMap<>();
188         int totTokens = 0;
189         ArrayList<Term> allTerms = new ArrayList<>();
190 
191         HashMap<Integer, Integer> tokenFromStart = new HashMap<>();
192         HashMap<Integer, Integer> tokenFromEnd = new HashMap<>();
193 
194         ArrayList<WF> allTokens = new ArrayList<>();
195         HashMap<Integer, HashSet<LinkingTag>> keywords = new HashMap<>();
196 
197         if (document.containsKey(TWMAnnotations.LinkingAnnotations.class)) {
198             for (LinkingTag e : document.get(TWMAnnotations.LinkingAnnotations.class)) {
199                 int start = e.getOffset();
200                 if (keywords.get(start) == null) {
201                     keywords.put(start, new HashSet<LinkingTag>());
202                 }
203                 keywords.get(start).add(e);
204                 logger.debug("Annotated entity (DS): " + e);
205             }
206         }
207 
208         // Main loop
209         List<CoreMap> get = document.get(CoreAnnotations.SentencesAnnotation.class);
210         for (int sentIndex = 0; sentIndex < get.size(); sentIndex++) {
211             CoreMap stanfordSentence = get.get(sentIndex);
212             List<CoreLabel> tokens = stanfordSentence.get(CoreAnnotations.TokensAnnotation.class);
213 
214             ArrayList<Term> terms = new ArrayList<>();
215             ArrayList<String> ners = new ArrayList<>();
216 
217             sentIndexes.put(sentIndex, totTokens);
218             totTokens += tokens.size();
219 
220             HashMap<Integer, TreeSet<Integer>> children = new HashMap<>();
221 
222             String lastNER = "O";
223             ArrayList<NerEntity> entities = new ArrayList<>();
224 
225             for (int i = 0; i < tokens.size(); i++) {
226                 CoreLabel stanfordToken = tokens.get(i);
227                 String form = stanfordToken.get(CoreAnnotations.TextAnnotation.class);
228                 String lemma = stanfordToken.get(CoreAnnotations.LemmaAnnotation.class);
229                 String pos = stanfordToken.get(CoreAnnotations.PartOfSpeechAnnotation.class);
230 
231                 form = AnnotatorUtils.codeToParenthesis(form);
232                 if (lemma != null) {
233                     lemma = AnnotatorUtils.codeToParenthesis(lemma);
234                 }
235                 pos = AnnotatorUtils.codeToParenthesis(pos);
236 
237                 children.put(i, new TreeSet<Integer>());
238 
239                 // Tokens
240                 WF thisWF = NAFdocument.newWF(form, stanfordToken.beginPosition(), sentIndex + 1);
241                 thisWF.setPara(1); //todo: Always set paragraph 1
242 
243                 Integer tokenID = totTokens - tokens.size() + i;
244 
245                 tokenFromStart.put(stanfordToken.beginPosition(), tokenID);
246                 tokenFromEnd.put(stanfordToken.beginPosition() + thisWF.getLength(), tokenID);
247                 allTokens.add(tokenID, thisWF);
248 
249                 // Term
250                 Span<WF> thisWFSpan = KAFDocument.newWFSpan();
251                 thisWFSpan.addTarget(thisWF);
252                 Term thisTerm = NAFdocument.newTerm("open", lemma, pos, thisWFSpan);
253                 thisTerm.setMorphofeat(pos);
254 
255                 // Upos
256                 String upos = stanfordToken.get(CustomAnnotations.UPosAnnotation.class);
257                 thisTerm.setUpos(upos);
258 
259                 // WordNet sense
260                 String wnSense = stanfordToken.get(UKBAnnotations.UKBAnnotation.class);
261                 if (wnSense != null) {
262                     thisTerm.setWordnetSense(stanfordToken.get(UKBAnnotations.UKBAnnotation.class));
263                 }
264 
265                 // Simple POS
266                 String simplePos = stanfordToken.get(WNPosAnnotations.WNPosAnnotation.class);
267                 if (simplePos == null) {
268                     simplePos = "O";
269                 }
270                 thisTerm.setPos(simplePos);
271 
272                 // Features
273                 Map<String, Collection<String>> features = stanfordToken.get(CustomAnnotations.FeaturesAnnotation.class);
274                 thisTerm.setFeatures(features);
275 
276                 terms.add(thisTerm);
277                 allTerms.add(thisTerm);
278 
279                 String ne = stanfordToken.get(CoreAnnotations.NamedEntityTagAnnotation.class);
280                 if (nerMap.containsKey(ne)) {
281                     ne = nerMap.get(ne);
282                 }
283                 String normVal = stanfordToken.getString(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
284                 if (ne != null) {
285                     if (ne.equals("O")) {
286                         ners.add("0");
287                     } else {
288 
289                         // Alternative string for SST
290                         String alt = NER2SSTtagset.tagset.get(ne);
291                         if (alt == null) {
292                             alt = "MISC";
293                         }
294 
295                         if (ne.equals(lastNER)) {
296                             entities.get(entities.size() - 1).setEndToken(i);
297                             ners.add("I-" + alt);
298                         } else {
299                             NerEntity newEntity = new NerEntity(ne, i, normVal);
300                             newEntity.setScoredLabels(stanfordToken.get(NERConfidenceAnnotator.ScoredNamedEntityTagsAnnotation.class));
301                             entities.add(newEntity);
302                             ners.add("B-" + alt);
303                         }
304                     }
305                     lastNER = ne;
306                 } else {
307                     ners.add("0");
308                 }
309 
310             }
311 
312 
313 //            @todo change next to UD??
314             for (int i = 0; i < tokens.size(); i++) {
315                 CoreLabel stanfordToken = tokens.get(i);
316 
317                 // Dependencies
318                 if (!stanfordToken.containsKey(CoreAnnotations.CoNLLDepParentIndexAnnotation.class)) {
319                     continue;
320                 }
321 
322                 int head = stanfordToken.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
323                 head++;
324                 String depRel = stanfordToken.get(CoreAnnotations.CoNLLDepTypeAnnotation.class);
325                 if (head != 0) {
326                     Term from = terms.get(head - 1);
327                     Term to = terms.get(i);
328                     NAFdocument.newDep(from, to, depRel);
329                 }
330 
331                 Word word = stanfordToken.get(MateAnnotations.MateTokenAnnotation.class);
332                 if (word != null) {
333                     List<Word> toRoot = Word.pathToRoot(word);
334                     for (Word w : toRoot) {
335                         int id = w.getIdx() - 1;
336                         if (id < 0) {
337                             continue;
338                         }
339                         children.get(id).add(i);
340                     }
341                 }
342             }
343 
344             // Opinion
345 
346 //            boolean includeNeutral = config.getProperty("stanford_include_neutral", "0").equals("1");
347 //
348 //            Tree sentimentTree = stanfordSentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
349 //            if (sentimentTree != null) {
350 //                HashMap<edu.stanford.nlp.ling.Word, Term> indexedWords = new HashMap<>();
351 //                int wordIndex = -1;
352 //                for (Tree t : sentimentTree.getLeaves()) {
353 //                    wordIndex++;
354 //                    List<edu.stanford.nlp.ling.Word> words = t.yieldWords();
355 //                    for (edu.stanford.nlp.ling.Word w : words) {
356 //                        indexedWords.put(w, terms.get(wordIndex));
357 //                    }
358 //                }
359 //
360 //                for (Tree tree : sentimentTree) {
361 //
362 //                    Integer predictedClass;
363 //                    try {
364 //                        predictedClass = RNNCoreAnnotations.getPredictedClass(tree);
365 //                    } catch (Exception e) {
366 //                        continue;
367 //                    }
368 //
369 //                    if (predictedClass == null) {
370 //                        continue;
371 //                    }
372 //
373 //                    if (!includeNeutral && predictedClass == 2) {
374 //                        continue;
375 //                    }
376 //
377 //                    Span<Term> treeSpan = KAFDocument.newTermSpan();
378 //                    for (edu.stanford.nlp.ling.Word word : tree.yieldWords()) {
379 //                        treeSpan.addTarget(indexedWords.get(word));
380 //                    }
381 //
382 //                    Opinion opinion = NAFdocument.createOpinion();
383 //                    opinion.setLabel("stanford-sentiment");
384 //                    Opinion.OpinionExpression opinionExpression = opinion.createOpinionExpression(treeSpan);
385 //                    opinionExpression.setPolarity(stanfordSentimentLabels[predictedClass]);
386 //
387 //                    NumberFormat nf = NumberFormat.getNumberInstance();
388 //                    nf.setMaximumFractionDigits(2);
389 //
390 //                    SimpleMatrix predictions = RNNCoreAnnotations.getPredictions(tree);
391 //                    StringBuffer stringBuffer = new StringBuffer();
392 //                    stringBuffer.append(nf.format(predictions.get(0)));
393 //                    stringBuffer.append("|");
394 //                    stringBuffer.append(nf.format(predictions.get(1)));
395 //                    stringBuffer.append("|");
396 //                    stringBuffer.append(nf.format(predictions.get(2)));
397 //                    stringBuffer.append("|");
398 //                    stringBuffer.append(nf.format(predictions.get(3)));
399 //                    stringBuffer.append("|");
400 //                    stringBuffer.append(nf.format(predictions.get(4)));
401 //                    opinionExpression.setStrength(stringBuffer.toString());
402 //                }
403 //            }
404 
405             // Entities
406 
407             for (NerEntity entity : entities) {
408 
409                 int startIndex = terms.get(entity.getStartToken()).getWFs().get(0).getOffset();
410                 int endIndex = terms.get(entity.getEndToken()).getWFs()
411                         .get(terms.get(entity.getEndToken()).getWFs().size() - 1).getOffset() +
412                         terms.get(entity.getEndToken()).getWFs()
413                                 .get(terms.get(entity.getEndToken()).getWFs().size() - 1).getLength();
414 
415                 logger.debug("Stanford NER entity: " + entity + "");
416                 logger.debug(String.format("Stanford NER entity: %s (from %d to %d)", entity.getLabel(), startIndex,
417                         endIndex));
418 
419                 Span<Term> thisTermSpan = KAFDocument.newTermSpan();
420                 Span<WF> thisWFSpan = KAFDocument.newWFSpan();
421 
422                 for (int i = entity.getStartToken(); i <= entity.getEndToken(); i++) {
423                     thisTermSpan.addTarget(terms.get(i));
424                     thisWFSpan.addTargets(terms.get(i).getWFs());
425                 }
426 
427                 List<Span<Term>> thisTermList = new LinkedList<>();
428                 List<Span<WF>> thisWFList = new LinkedList<>();
429 
430                 thisTermList.add(thisTermSpan);
431                 thisWFList.add(thisWFSpan);
432 
433                 Entity thisEntity = null;
434                 Timex3 thisTimex = null;
435 
436                 entity.setLabel(entity.getLabel().toUpperCase());
437 
438                 switch (entity.getLabel()) {
439                     case "PERSON":
440                     case "LOCATION":
441                     case "ORGANIZATION":
442 
443                     case "MISC":
444                     case "MONEY":
445                     case "PERCENT":
446 
447                         // Compatibility with Tint
448                     case "PER":
449                     case "LOC":
450                     case "ORG":
451 
452                         thisEntity = NAFdocument.newEntity(thisTermList);
453                         String entityLabel = entity.getLabel().replace("PERSON","PER").replace("ORGANIZATION","ORG").replace("LOCATION","LOC");
454                         thisEntity.setType(entityLabel);
455 
456                         // Normalized value
457                         if (entity.getNormalizedValue() != null && entity.getNormalizedValue().length() > 0) {
458                             thisEntity.addExternalRef(NAFdocument.createExternalRef("value", entity.getNormalizedValue()));
459                         }
460 
461                         if (enableEntityAssignment) {
462                             LinkingTag e = null;
463                             HashSet<LinkingTag> possibleEntities = keywords.get(startIndex);
464                             if (possibleEntities != null) {
465                                 for (LinkingTag loopEntity : possibleEntities) {
466                                     int end = loopEntity.getOffset() + loopEntity.getLength();
467                                     if (end != endIndex) {
468                                         continue;
469                                     }
470                                     if (e == null || e.getScore() < loopEntity.getScore()) {
471                                         e = loopEntity;
472                                     }
473                                 }
474                             }
475 
476                             if (e != null) {
477                                 ExternalRef ext = NAFdocument.newExternalRef(e.getSource(), e.getPage());
478                                 ext.setConfidence((float) e.getScore());
479                                 thisEntity.addExternalRef(ext);
480                             }
481                         }
482 
483                         break;
484 
485                     case "NUMBER":
486                         thisEntity = NAFdocument.newEntity(thisTermList);
487                         thisEntity.setType("CARDINAL");
488                         thisEntity.addExternalRef(NAFdocument.createExternalRef("value", entity.getNormalizedValue()));
489                         break;
490 
491                     case "ORDINAL":
492                         thisEntity = NAFdocument.newEntity(thisTermList);
493                         thisEntity.setType("ORDINAL");
494                         thisEntity.addExternalRef(NAFdocument.createExternalRef("value", entity.getNormalizedValue()));
495                         break;
496 
497                     case "DATE":
498                     case "TIME":
499                         thisTimex = NAFdocument.newTimex3(thisWFSpan, entity.getLabel());
500                         thisTimex.setValue(entity.getNormalizedValue());
501                         break;
502 
503                     case "DURATION":
504                         thisTimex = NAFdocument.newTimex3(thisWFSpan, entity.getLabel());
505                         thisTimex.setValue(entity.getNormalizedValue());
506                         break;
507 
508                     default:
509                         logger.debug(entity.getLabel());
510                 }
511 
512                 if (thisEntity != null && entity.getScoredLabels() != null) {
513                     for (Entry<String, Double> entry : entity.getScoredLabels().entrySet()) {
514                         ExternalRef ref = NAFdocument.createExternalRef("value-confidence",
515                                 entry.getKey().replace("PERSON","PER").replace("ORGANIZATION","ORG").replace("LOCATION","LOC"));
516                         ref.setConfidence(entry.getValue().floatValue());
517                         thisEntity.addExternalRef(ref);
518                     }
519                 }
520             }
521 
522             for (int i = 0; i < tokens.size(); i++) {
523                 CoreLabel stanfordToken = tokens.get(i);
524 
525                 se.lth.cs.srl.corpus.Predicate predicate = stanfordToken.get(MateAnnotations.MateAnnotation.class);
526                 if (predicate != null) {
527                     Span<Term> thisTermSpan = KAFDocument.newTermSpan();
528                     Term thisTerm = terms.get(predicate.getIdx() - 1);
529                     String mateSense = predicate.getSense();
530 
531                     thisTermSpan.addTarget(thisTerm);
532 
533                     Predicate newPred = NAFdocument.newPredicate(thisTermSpan);
534                     newPred.setSource("mate");
535 
536 
537                     boolean verb =true;
538                     String sense = null;
539 
540 
541                     ExternalRef e;
542                     // If it's a verb -> PropBank, if it's a noun -> NomBank
543 //                    @todo change next to UD
544                     if (thisTerm.getPos().equals("V")) {
545                         e = NAFdocument.newExternalRef("PropBank", mateSense);
546                         e.setSource("mate");
547                         sense = mateSense;
548                     } else {
549                         verb=false;
550                         e = NAFdocument.newExternalRef("NomBank", mateSense);
551                         e.setSource("mate");
552                         // check NomBank
553                         NomBank.Roleset roleset = NomBank.getRoleset(mateSense);
554                         try {
555                             sense = roleset.getPBId();
556                         } catch (Exception ex) {
557                             logger.debug(ex.getMessage());
558                         }
559                     }
560                     newPred.addExternalRef(e);
561 
562 
563 
564                     ArrayList<String> vnClasses = new ArrayList<>();
565                     ArrayList<String> fnFrames = new ArrayList<>();
566 
567                     if (enablePM) {
568                         if (sense != null && sense.length() > 0) {
569 
570                             HashSet<String> vnToAdd = new HashSet<>();
571                             String vnFinal = null;
572 
573                             // VerbNet
574                             vnClasses = PM.getVNClasses(sense);
575                             if (!vnClasses.isEmpty()) {
576                                 if (vnClasses.size() == 1 || !enableOntoNotesFilter) {
577                                     for (String vnClass1 : vnClasses) {
578                                         vnToAdd.add(vnClass1);
579                                         vnFinal = vnClass1;
580                                     }
581                                 } else {
582                                     Integer value = 0;
583 
584                                     for (String vnClass : vnClasses) {
585                                         Integer thisValue = statisticsExtractor.getVnTotals().get(vnClass);
586                                         thisValue = thisValue == null ? 0 : thisValue;
587                                         if (thisValue >= value) {
588                                             vnFinal = vnClass;
589                                             value = thisValue;
590                                         }
591                                     }
592 
593                                     // Reset the list of classes
594                                     vnClasses = new ArrayList<>();
595 
596                                     if (vnFinal != null) {
597                                         vnToAdd.add(vnFinal);
598                                         vnClasses.add(vnFinal);
599                                     }
600                                 }
601                             }
602                             //check if the sense maps to a subclass of a selected class
603                             ArrayList<String> vnSubClasses = PM.getVNSubClasses(sense);
604                             if (!vnSubClasses.isEmpty()) {
605                                 for (String vnSubClass1 : vnSubClasses) {
606                                     for (String vnClass : vnClasses) {
607                                         if (!vnSubClass1.startsWith(vnClass)) {
608                                             continue;
609                                         }
610 
611                                         vnToAdd.add(vnSubClass1);
612 
613                                         // Remove upper class
614                                         if (vnFinal != null) {
615                                             if (vnSubClass1.startsWith(vnFinal)) {
616                                                 vnToAdd.remove(vnFinal);
617                                             }
618                                         }
619                                     }
620                                 }
621                             }
622 
623                             for (String vnClass1 : vnToAdd) {
624                                 ExternalRef vnClass = NAFdocument.newExternalRef("VerbNet", vnClass1);
625                                 vnClass.setSource("mate+pm");
626                                 newPred.addExternalRef(vnClass);
627                             }
628 
629                             //added to make consistents thematic roles and vnclasses
630                             vnClasses.clear();
631                             vnClasses.addAll(vnToAdd);
632 
633                             // FrameNet
634                             fnFrames = PM.getFNFrames(sense);
635 
636                             if (enableOntoNotesFilter) {
637                                 HashSet<String> possibleFrames = new HashSet<>();
638                                 for (String vnClass : vnClasses) {
639                                     possibleFrames.addAll(PM.getVNClassesToFN(vnClass));
640                                 }
641 
642 //								System.out.println("vnClasses: " + vnClasses);
643 //								System.out.println("fnFrames (before): " + fnFrames);
644                                 fnFrames.retainAll(possibleFrames);
645 //								System.out.println("fnFrames (after): " + fnFrames);
646 //								System.out.println("Possible frames: " + possibleFrames);
647                             }
648 
649                             if (!fnFrames.isEmpty()) {
650                                 if (fnFrames.size() == 1 || !enableOntoNotesFilter) {
651                                     for (String fnFrame1 : fnFrames) {
652                                         ExternalRef fnFrame = NAFdocument.newExternalRef("FrameNet", fnFrame1);
653                                         fnFrame.setSource("mate+pm");
654                                         newPred.addExternalRef(fnFrame);
655                                     }
656                                 } else {
657                                     Integer value = 0;
658                                     String fnFinal = null;
659 
660                                     for (String fnFrame : fnFrames) {
661                                         Integer thisValue = statisticsExtractor.getFnTotals()
662                                                 .get(fnFrame.toLowerCase());
663                                         thisValue = thisValue == null ? 0 : thisValue;
664                                         if (thisValue >= value) {
665                                             fnFinal = fnFrame;
666                                             value = thisValue;
667                                         }
668                                     }
669 
670                                     // Reset the list of frames
671                                     fnFrames = new ArrayList<>();
672 
673                                     if (fnFinal != null) {
674                                         ExternalRef fnFrame = NAFdocument.newExternalRef("FrameNet", fnFinal);
675                                         fnFrame.setSource("mate+pm");
676                                         newPred.addExternalRef(fnFrame);
677                                         fnFrames.add(fnFinal);
678                                     }
679                                 }
680                             }
681 
682                             if (!verb){
683                                 // PropBank
684                                 ArrayList<String> pbPredicates = PM.getPBPredicates(sense);
685                                 if (!pbPredicates.isEmpty()) {
686                                     for (String pbPredicate1 : pbPredicates) {
687                                         ExternalRef pbPredicate = NAFdocument.newExternalRef("PropBank", pbPredicate1);
688                                         pbPredicate.setSource("mate+nb");
689                                         newPred.addExternalRef(pbPredicate);
690                                     }
691                                 }
692                             }
693 
694                             // ESO
695                             ArrayList<String> esoClasses = PM.getESOClasses(sense);
696                             if (!esoClasses.isEmpty()) {
697                                 for (String esoClass1 : esoClasses) {
698                                     ExternalRef esoClass = NAFdocument.newExternalRef("ESO", esoClass1);
699                                     esoClass.setSource("mate+pm");
700                                     newPred.addExternalRef(esoClass);
701                                 }
702                             }
703 //                            Not in pm1.3
704 //                            // EventType
705 //                            ArrayList<String> eventTypes = PM.getEventTypes(sense);
706 //                            if (!eventTypes.isEmpty()) {
707 //                                for (String eventType1 : eventTypes) {
708 //                                    ExternalRef eventType = NAFdocument.newExternalRef("EventType", eventType1);
709 //                                    eventType.setSource("mate+pm");
710 //                                    newPred.addExternalRef(eventType);
711 //                                }
712 //                            }
713 
714                             // WordNet
715                             ArrayList<String> wnSenses = PM.getWNSenses(sense);
716                             if (!wnSenses.isEmpty()) {
717                                 for (String wnSense1 : wnSenses) {
718                                     ExternalRef wnSense = NAFdocument.newExternalRef("WordNet", wnSense1);
719                                     wnSense.setSource("mate+pm");
720                                     newPred.addExternalRef(wnSense);
721                                 }
722                             }
723 
724                         }
725                     }
726 
727                     for (Word w : predicate.getArgMap().keySet()) {
728                         Span<Term> thisTermSpanForRole = KAFDocument.newTermSpan();
729                         for (int k : children.get(w.getIdx() - 1)) {
730                             thisTermSpanForRole.addTarget(terms.get(k));
731                         }
732                         thisTermSpanForRole.setHead(terms.get(w.getIdx() - 1));
733 
734                         String argument = predicate.getArgMap().get(w);
735                         Predicate.Role newRole = NAFdocument.newRole(newPred, argument, thisTermSpanForRole);
736                         ExternalRef mateRoleRef;
737                         if(verb){
738 
739                             mateRoleRef = NAFdocument
740                                     .newExternalRef("PropBank", mateSense+"@"+argument);
741                             mateRoleRef.setSource("mate");
742 
743 
744                         } else {
745 
746                             mateRoleRef = NAFdocument
747                                     .newExternalRef("NomBank", mateSense+"@"+argument);
748                             mateRoleRef.setSource("mate");
749                         }
750 
751                         newRole.addExternalRef(mateRoleRef);
752 
753 
754                         if (enablePM && PM != null && statisticsExtractor != null) {
755 
756                             // VerbNet
757                             ArrayList<String> vnThematicRoles = PM.getVNThematicRoles(sense + "@" + argument);
758                             if (!vnThematicRoles.isEmpty()) {
759                                 for (String vnThematicRole1 : vnThematicRoles) {
760                                     if (!enableOntoNotesFilter) {
761                                         ExternalRef vnThematicRole = NAFdocument
762                                                 .newExternalRef("VerbNet", vnThematicRole1);
763                                         vnThematicRole.setSource("mate+pm");
764                                         newRole.addExternalRef(vnThematicRole);
765                                     } else {
766                                         String[] parts = vnThematicRole1.split("@");
767                                         if (vnClasses.contains(parts[0])) {
768                                             ExternalRef vnThematicRole = NAFdocument
769                                                     .newExternalRef("VerbNet", vnThematicRole1);
770                                             vnThematicRole.setSource("mate+pm");
771                                             newRole.addExternalRef(vnThematicRole);
772                                         }
773                                     }
774                                 }
775                             }
776 
777                             // FrameNet
778                             ArrayList<String> fnFrameElements = PM.getFNFrameElements(sense + "@" + argument);
779                             if (!fnFrameElements.isEmpty()) {
780                                 for (String fnFrameElement1 : fnFrameElements) {
781                                     if (!enableOntoNotesFilter) {
782                                         ExternalRef fnFrameElement = NAFdocument
783                                                 .newExternalRef("FrameNet", fnFrameElement1);
784                                         fnFrameElement.setSource("mate+pm");
785                                         newRole.addExternalRef(fnFrameElement);
786                                     } else {
787                                         String[] parts = fnFrameElement1.split("@");
788                                         if (fnFrames.contains(parts[0])) {
789                                             ExternalRef fnFrameElement = NAFdocument
790                                                     .newExternalRef("FrameNet", fnFrameElement1);
791                                             fnFrameElement.setSource("mate+pm");
792                                             newRole.addExternalRef(fnFrameElement);
793                                         }
794                                     }
795                                 }
796                             }
797 
798                             // PropBank
799                             if (!verb) {
800                                 ArrayList<String> pbArguments = PM.getPBArguments(sense + "@" + argument);
801                                 if (!pbArguments.isEmpty()) {
802                                     for (String pbArgument1 : pbArguments) {
803                                         ExternalRef pbArgument = NAFdocument.newExternalRef("PropBank", pbArgument1);
804                                         pbArgument.setSource("mate+pm");
805                                         newRole.addExternalRef(pbArgument);
806                                     }
807                                 }
808                             }
809                             // ESO
810                             ArrayList<String> esoRoles = PM.getESORoles(sense + "@" + argument);
811                             if (!esoRoles.isEmpty()) {
812                                 for (String esoRole1 : esoRoles) {
813                                     ExternalRef esoRole = NAFdocument.newExternalRef("ESO", esoRole1);
814                                     esoRole.setSource("mate+pm");
815                                     newRole.addExternalRef(esoRole);
816                                 }
817                             }
818                         }
819 
820                         newPred.addRole(newRole);
821                     }
822 
823                 }
824             }
825 
826             if (stanfordSentence.containsKey(SemaforAnnotations.SemaforAnnotation.class)) {
827                 SemaforParseResult semaforParseResult = stanfordSentence.get(SemaforAnnotations.SemaforAnnotation.class);
828                 ObjectMapper mapper = new ObjectMapper();
829 
830                 mapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);
831                 Semafor.SemaforResponse semaforResponse = mapper
832                         .readValue(semaforParseResult.toJson(), Semafor.SemaforResponse.class);
833                 for (Semafor.SemaforFrame semaforFrame : semaforResponse.getFrames()) {
834                     Semafor.SemaforAnnotation semaforTarget = semaforFrame.getTarget();
835                     if (semaforTarget == null) {
836                         continue;
837                     }
838                     String frameName = semaforTarget.getName();
839 
840                     if (semaforTarget.getSpans().size() == 0) {
841                         continue;
842                     }
843                     if (semaforFrame.getAnnotationSets().size() == 0) {
844                         continue;
845                     }
846 
847                     Semafor.SemaforSpan semaforSpan = semaforTarget.getSpans().get(0);
848                     Semafor.SemaforSet semaforAnnotation = semaforFrame.getAnnotationSets().get(0);
849 
850                     Span<Term> termSpan = KAFDocument.newTermSpan();
851                     for (int i = semaforSpan.getStart(); i < semaforSpan.getEnd(); i++) {
852                         termSpan.addTarget(terms.get(i));
853                     }
854 
855                     if (termSpan.size() == 0) {
856                         continue;
857                     }
858 
859                     Predicate predicate = NAFdocument.newPredicate(termSpan);
860                     predicate.setSource("semafor");
861                     predicate.setConfidence(semaforAnnotation.getScore());
862                     ExternalRef frameNameExt = NAFdocument.createExternalRef("FrameNet", frameName);
863                     frameNameExt.setSource("semafor");
864                     predicate.addExternalRef(frameNameExt);
865 
866                     predicate.setId("f_" + predicate.getId());
867 
868                     for (Semafor.SemaforAnnotation frameAnnotation : semaforAnnotation.getFrameElements()) {
869                         Semafor.SemaforSpan roleSpan = frameAnnotation.getSpans().get(0);
870                         String roleName = frameAnnotation.getName();
871 
872                         Span<Term> roleTermSpan = KAFDocument.newTermSpan();
873                         for (int i = roleSpan.getStart(); i < roleSpan.getEnd(); i++) {
874                             roleTermSpan.addTarget(terms.get(i));
875                         }
876 
877                         if (roleTermSpan.size() == 0) {
878                             continue;
879                         }
880 
881                         Predicate.Role role = NAFdocument.newRole(predicate, "", roleTermSpan);
882 
883 //                        @todo change next to UD
884                         final Term head = NAFUtils.extractHead(NAFdocument, role.getSpan());
885                         if (head != null) {
886                             final Span<Term> newSpan = KAFDocument
887                                     .newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(
888                                             NAFdocument.getTermsByDepAncestors(ImmutableList.of(head))));
889                             role.setSpan(newSpan);
890                         }
891                         ExternalRef roleNameExt = NAFdocument.createExternalRef("FrameNet", frameName + "@" + roleName);
892                         roleNameExt.setSource("semafor");
893                         role.addExternalRef(roleNameExt);
894 //                        predicate.setSource("semafor");
895                         predicate.addRole(role);
896                     }
897 
898                 }
899             }
900 
901             // Constituency: do we need it?
902             Tree tree = stanfordSentence.get(TreeCoreAnnotations.TreeAnnotation.class);
903             if (tree != null) {
904                 NAFdocument.addConstituencyString(tree.toString(), sentIndex + 1);
905                 try {
906                     logger.debug("Tree: " + tree.toString());
907 //                    @todo change next to UD
908                     addHeads(tree);
909                     NAFdocument.addConstituencyFromParentheses(tree.toString(), sentIndex + 1);
910                 } catch (Exception e) {
911                     logger.info("Tree: " + tree.toString());
912                     logger.warn(e.getMessage());
913                     e.printStackTrace();
914                 }
915             }
916 
917         } // end sentences loop
918 
919         // Entities
920         for (Integer startIndex : keywords.keySet()) {
921             for (LinkingTag e : keywords.get(startIndex)) {
922                 int end = e.getOffset() + e.getLength();
923                 Integer startToken = tokenFromStart.get(e.getOffset());
924                 Integer endToken = tokenFromEnd.get(end);
925                 Span<WF> span = KAFDocument.newWFSpan();
926                 if (startToken != null && endToken != null) {
927                     for (int j = startToken; j <= endToken; j++) {
928                         span.addTarget(allTokens.get(j));
929                     }
930 
931                     try {
932                         LinkedEntity linkedEntity = NAFdocument.newLinkedEntity(span);
933                         linkedEntity.setConfidence(e.getScore());
934                         linkedEntity.setReference(e.getPage());
935                         linkedEntity.setResource(e.getSource());
936                         linkedEntity.setTypes(e.getStringTypes());
937                         linkedEntity.setSpotted(e.isSpotted());
938                     } catch (Exception err) {
939                         logger.error("Error on adding linkedEntity: " + err.getMessage());
940                     }
941                 }
942             }
943         }
944 
945         // Coref
946 
947         // Simple coref
948         HashMultimap<Integer, Integer> simpleCoref = document.get(CustomAnnotations.SimpleCorefAnnotation.class);
949         if (simpleCoref != null) {
950             List<Span<Term>> mentions = new ArrayList<>();
951 
952             for (Integer sentenceID : simpleCoref.keySet()) {
953                 TreeSet<Integer> sortedSet = new TreeSet<>();
954                 sortedSet.addAll(simpleCoref.get(sentenceID));
955 
956                 Span<Term> thisTermSpan = KAFDocument.newTermSpan();
957                 int lastTokenID = -1;
958 
959                 for (Integer tokenID : sortedSet) {
960                     tokenID = tokenID - 1;
961                     int sentenceStartTokenIndex = sentIndexes.get(sentenceID);
962                     int id = sentenceStartTokenIndex + tokenID;
963                     if (tokenID - lastTokenID > 1) {
964                         if (thisTermSpan.size() > 0) {
965                             mentions.add(thisTermSpan);
966                         }
967                         thisTermSpan = KAFDocument.newTermSpan();
968                     }
969                     thisTermSpan.addTarget(allTerms.get(id));
970                     lastTokenID = tokenID;
971                 }
972                 if (thisTermSpan.size() > 0) {
973                     mentions.add(thisTermSpan);
974                 }
975             }
976 
977             if (mentions.size() > 0) {
978                 NAFdocument.newCoref(mentions);
979             }
980         }
981 
982         // Loop through clusters
983         if (coreferenceGraph != null) {
984             for (Object c : coreferenceGraph.keySet()) {
985 
986                 CorefChain chain = coreferenceGraph.get(c);
987                 Map<IntPair, Set<CorefChain.CorefMention>> mentionMap = chain.getMentionMap();
988 
989                 List<Span<Term>> mentions = new ArrayList<>();
990 
991                 // Loop through sentences
992                 for (IntPair p : mentionMap.keySet()) {
993 
994                     Set<CorefChain.CorefMention> corefMentions = mentionMap.get(p);
995                     if (corefMentions.size() < 2) {
996                         continue;
997                     }
998 
999                     // Loop through mentions
1000                     for (CorefChain.CorefMention m : corefMentions) {
1001 
1002                         int sentenceStartTokenIndex = sentIndexes.get(m.sentNum - 1);
1003                         int start = sentenceStartTokenIndex + m.startIndex - 1;
1004 
1005                         Span<Term> thisTermSpan = KAFDocument.newTermSpan();
1006                         for (int i = start; i < start + m.endIndex - m.startIndex; i++) {
1007                             thisTermSpan.addTarget(allTerms.get(i));
1008                         }
1009 
1010                         if (thisTermSpan.size() > 0) {
1011                             mentions.add(thisTermSpan);
1012                         }
1013                     }
1014                 }
1015 
1016                 if (mentions.size() > 0) {
1017                     NAFdocument.newCoref(mentions);
1018                 }
1019             }
1020         }
1021 
1022         // NAF filter
1023         if (enableNafFilter) {
1024             logger.info("Applying NAF filter");
1025             Properties nafFilterConfig = PropertiesUtils.dotConvertedProperties(properties, "filter");
1026 
1027             LinguisticProcessor linguisticProcessor = new LinguisticProcessor("naf-filter", "NAF filter");
1028             linguisticProcessor.setBeginTimestamp();
1029             try {
1030                 NAFFilter filter = NAFFilter.builder().withProperties(properties, "filter").build();
1031                 filter.filter(NAFdocument);
1032 
1033                 //NAFFilter.builder().withProperties(properties,"filter").build().filter(NAFdocument);
1034 //                NAFFilter.builder().build().filter(NAFdocument);
1035 //                NAFFilter.builder(false)
1036 //                        .withTermSenseCompletion(true).withSRLRoleLinking(false, false)
1037 //                        .withOpinionLinking(false, false).build()
1038 //                        .filter(NAFdocument);
1039             } catch (Exception e) {
1040                 logger.error("Error applying NAF filter");
1041             }
1042             linguisticProcessor.setEndTimestamp();
1043             NAFdocument.addLinguisticProcessor(linguisticProcessor.getLayer(), linguisticProcessor);
1044         }
1045     }
1046 
1047     private KAFDocument parseAll(KAFDocument NAFdocument) throws Exception {
1048         return parseAll(NAFdocument, new Properties());
1049     }
1050 
1051     private KAFDocument parseAll(KAFDocument NAFdocument, Properties merge) throws Exception {
1052 
1053         String text = NAFdocument.getRawText();
1054         text = StringEscapeUtils.unescapeHtml(text);
1055 
1056         Properties properties = getDefaultConfig();
1057         properties.putAll(merge);
1058 
1059         String maxTextLen = properties.getProperty("max_text_len");
1060         int limit = Integer.parseInt(maxTextLen);
1061         if (text.length() > limit) {
1062             throw new Exception(String.format("Input too long (%d chars, limit is %d)", text.length(), limit));
1063         }
1064 
1065         loadModels(properties);
1066         Properties stanfordConfig = PropertiesUtils.dotConvertedProperties(properties, "stanford");
1067 
1068         // Load pipeline
1069         Properties thisSessionProps = new Properties(stanfordConfig);
1070         StanfordCoreNLP thisPipeline = new StanfordCoreNLP(thisSessionProps);
1071 
1072         // Stanford
1073         logger.info("Annotating with Stanford CoreNLP");
1074         LinguisticProcessor linguisticProcessor = new LinguisticProcessor("text", "Stanford CoreNLP");
1075         linguisticProcessor.setBeginTimestamp();
1076         Annotation document = new Annotation(text);
1077         document.set(CoreAnnotations.DocDateAnnotation.class, NAFdocument.getFileDesc().creationtime);
1078         if (NAFdocument.getFileDesc().title != null) {
1079             document.set(CoreAnnotations.DocTitleAnnotation.class, NAFdocument.getFileDesc().title);
1080         }
1081         thisPipeline.annotate(document);
1082         logger.info(thisPipeline.timingInformation());
1083         linguisticProcessor.setEndTimestamp();
1084         NAFdocument.addLinguisticProcessor(linguisticProcessor.getLayer(), linguisticProcessor);
1085 
1086         annotateStanford(properties, document, NAFdocument);
1087 
1088         logger.info("Parsing finished");
1089         return NAFdocument;
1090     }
1091 
1092     public KAFDocument parseFromNAF(KAFDocument NAFdocument) throws Exception {
1093 
1094         NAFdocument = parseAll(NAFdocument);
1095 
1096         return NAFdocument;
1097     }
1098 
1099     public KAFDocument parseFromString(String textInNafFormat) throws Exception {
1100         logger.debug("Parsing of NAF");
1101 
1102         InputStream is = new ByteArrayInputStream(textInNafFormat.getBytes());
1103         BufferedReader br = new BufferedReader(new InputStreamReader(is));
1104         KAFDocument NAFdocument = KAFDocument.createFromStream(br);
1105 
1106         try {
1107             logger.info("Document: " + NAFdocument.getFileDesc().filename);
1108             logger.info("Title: " + NAFdocument.getFileDesc().title);
1109 //            logger.debug("Text: " + NAFdocument.getRawText());
1110         } catch (Exception e) {
1111             logger.error(e.getMessage());
1112         }
1113 
1114         NAFdocument = parseAll(NAFdocument);
1115 
1116         return NAFdocument;
1117     }
1118 
1119 }