1   package eu.fbk.dkm.pikes.resources;
2   
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import javax.annotation.Nullable;

import com.google.common.base.Charsets;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;

import org.eclipse.rdf4j.model.IRI;

import eu.fbk.rdfpro.util.IO;
import eu.fbk.rdfpro.util.Statements;
import eu.fbk.utils.core.Range;

import ixa.kaflib.Coref;
import ixa.kaflib.Dep;
import ixa.kaflib.Entity;
import ixa.kaflib.ExternalRef;
import ixa.kaflib.Factuality;
import ixa.kaflib.KAFDocument;
import ixa.kaflib.Opinion;
import ixa.kaflib.Opinion.OpinionExpression;
import ixa.kaflib.Opinion.OpinionHolder;
import ixa.kaflib.Opinion.OpinionTarget;
import ixa.kaflib.Predicate;
import ixa.kaflib.Predicate.Role;
import ixa.kaflib.Span;
import ixa.kaflib.Term;
import ixa.kaflib.Timex3;
import ixa.kaflib.WF;
50  
51  public final class NAFUtils {
52  
    // Resource identifiers used in NAF ExternalRef elements for SRL resources

    public static final String RESOURCE_PROPBANK = "PropBank";

    public static final String RESOURCE_NOMBANK = "NomBank";

    public static final String RESOURCE_VERBNET = "VerbNet";

    public static final String RESOURCE_FRAMENET = "FrameNet";

    // Named-entity and WordNet word-sense / supersense resources

    public static final String RESOURCE_BBN = "BBN";

    public static final String RESOURCE_WN_SYNSET = "wn30-ukb";

    public static final String RESOURCE_WN_SST = "wn30-sst";

    public static final String RESOURCE_SUMO = "SUMO";

    // Resource names for refs/corefs added by the NAFFilter post-processing step

    public static final String RESOURCE_ENTITY_REF = "NAFFilter-EntityRef";

    public static final String RESOURCE_ENTITY_COREF = "NAFFilter-EntityCoref";

    public static final String RESOURCE_PREDICATE_REF = "NAFFilter-PredicateRef";

    public static final String RESOURCE_PREDICATE_COREF = "NAFFilter-PredicateCoref";

    public static final String RESOURCE_TIMEX_REF = "NAFFilter-TimexRef";

    public static final String RESOURCE_TIMEX_COREF = "NAFFilter-TimexCoref";

    public static final String RESOURCE_VALUE = "value";

    public static final String RESOURCE_YAGO = "Yago";

    // PreMOn RDF namespace, per-resource IRI prefixes and resource names
    public static final String PREMON_NAMESPACE = "http://premon.fbk.eu/resource/";
    public static final String PREMON_FNPREFIX = "fn15";
    public static final String PREMON_VNPREFIX = "vb32";
    public static final String PREMON_PBPREFIX = "pb17";
    public static final String PREMON_NBPREFIX = "nb10";
    public static final String PREMON_ARGUMENT_SEPARATOR = "@";
    public static final String PREMON_RESOURCE_PROPBANK = "PreMOn+PropBank";

    public static final String PREMON_RESOURCE_NOMBANK = "PreMOn+NomBank";

    public static final String PREMON_RESOURCE_VERBNET = "PreMOn+VerbNet";

    public static final String PREMON_RESOURCE_FRAMENET = "PreMOn+FrameNet";
99      public static final Ordering<Opinion> OPINION_COMPARATOR = new Ordering<Opinion>() {
100 
101         @Override
102         public int compare(final Opinion left, final Opinion right) {
103             final int leftOffset = left.getOpinionExpression().getSpan().getTargets().get(0)
104                     .getOffset();
105             final int rightOffset = right.getOpinionExpression().getSpan().getTargets().get(0)
106                     .getOffset();
107             return leftOffset - rightOffset;
108         }
109 
110     };
111 
    // Word forms made ONLY of non-alphanumeric characters are dropped by filterTerms() ...
    private static final Pattern WF_EXCLUSION_PATTERN = Pattern.compile("[^A-Za-z0-9]*");

    // ... unless they are one of these meaningful symbols
    private static final Set<String> SYMBOLS = ImmutableSet.of("$", "#", "&", "€");
115 
116     public static void normalize(final KAFDocument document) {
117 
118         // Convert SST, synset and BBN attributes to external refs
119         for (final Term term : document.getTerms()) {
120             boolean hasBBN = false;
121             boolean hasSynset = false;
122             boolean hasSST = false;
123             for (final ExternalRef ref : term.getExternalRefs()) {
124                 hasBBN |= RESOURCE_BBN.equalsIgnoreCase(ref.getResource());
125                 hasSynset |= RESOURCE_WN_SYNSET.equalsIgnoreCase(ref.getResource());
126                 hasSST |= RESOURCE_WN_SST.equalsIgnoreCase(ref.getResource());
127             }
128             if (!hasBBN && term.getBBNTag() != null) {
129                 term.addExternalRef(document.newExternalRef(RESOURCE_BBN, term.getBBNTag()));
130             }
131             if (!hasSynset && term.getWordnetSense() != null) {
132                 term.addExternalRef(document.newExternalRef(RESOURCE_WN_SYNSET,
133                         term.getWordnetSense()));
134             }
135             if (!hasSST && term.getSupersenseTag() != null) {
136                 term.addExternalRef(document.newExternalRef(RESOURCE_WN_SST,
137                         term.getSupersenseTag()));
138             }
139             term.setBBNTag(null);
140             term.setWordnetSense(null);
141             term.setSupersenseTag(null);
142         }
143 
144         // Remove duplicate external refs
145         for (final Predicate predicate : document.getPredicates()) {
146             normalizeRefs(getRefs(predicate));
147             for (final Role role : predicate.getRoles()) {
148                 normalizeRefs(getRefs(role));
149             }
150         }
151     }
152 
153     public static List<Term> filterTerms(final Iterable<Term> terms) {
154         final List<Term> result = Lists.newArrayList();
155         boolean atBeginning = true;
156         for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
157             final char pos = Character.toUpperCase(term.getPos().charAt(0));
158             if (atBeginning && (pos == 'D' || pos == 'P')) {
159                 continue;
160             }
161             for (final WF word : term.getWFs()) {
162                 final String text = word.getForm();
163                 if (SYMBOLS.contains(text) || !WF_EXCLUSION_PATTERN.matcher(text).matches()) {
164                     result.add(term);
165                     atBeginning = false;
166                     break;
167                 }
168             }
169         }
170         return result;
171     }
172 
173     public static String getText(final Iterable<Term> terms) {
174         final StringBuilder builder = new StringBuilder();
175         boolean atBeginning = true;
176         for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
177             final boolean properNoun = term.getMorphofeat().startsWith("NNP");
178             for (final WF word : term.getWFs()) {
179                 builder.append(atBeginning ? "" : " ");
180                 builder.append(properNoun ? word.getForm() : word.getForm().toLowerCase());
181                 atBeginning = false;
182             }
183         }
184         return builder.toString();
185     }
186 
187     @Nullable
188 
189     public static Term extractHead(final KAFDocument document, @Nullable final Span<Term> span) {
190         if (span == null) {
191             return null;
192         }
193         Term head = null; // span.getHead(); TODO
194         if (head == null) {
195 
196             head = document.getTermsHead(span.getTargets()); // (re)compute
197         }
198         return head;
199     }
200 
    /**
     * Extracts the head terms reachable from the given ancestor terms (or from the
     * dependency roots of {@code span}, if ancestors are not supplied) that satisfy the
     * optional predicate.
     *
     * @param document the document the terms belong to, not null
     * @param ancestors the terms to start the head search from; if null, computed as the
     *            terms of {@code span} whose dependency parent lies outside the span
     * @param span if non-null, the result is restricted to terms contained in this span
     * @param predicate an optional filter on candidate heads; null accepts any term
     * @return the set of accepted head terms, possibly empty
     */
    public static Set<Term> extractHeads(final KAFDocument document,
            @Nullable final Iterable<Term> ancestors, @Nullable final Iterable<Term> span,
            @Nullable final java.util.function.Predicate<Term> predicate) {

        // Use the supplied ancestors, or derive them as the span's dependency-tree roots
        Set<Term> ancestorSet;
        if (ancestors != null) {
            ancestorSet = ImmutableSet.copyOf(ancestors);
        } else {
            ancestorSet = Sets.newHashSet();
            // NOTE(review): if both 'ancestors' and 'span' are null this line throws NPE —
            // confirm callers never pass null for both
            final Set<Term> termSet = Sets.newHashSet(span);
            for (final Term term : termSet) {
                final Dep dep = document.getDepToTerm(term);
                if (dep == null || !termSet.contains(dep.getFrom())) {
                    ancestorSet.add(term);
                }
            }
        }

        // Collect heads by descending the dependency tree from each ancestor
        final Set<Term> result = Sets.newHashSet();
        for (final Term ancestor : ancestorSet) {
            extractHeadsHelper(document, ancestor, predicate, result);
        }
        // When a span is given, keep only the heads that belong to it
        if (span != null) {
            result.retainAll(ImmutableSet.copyOf(span));
        }
        // System.err.println(document.getPublic().uri + " -> " + termFilter + " / " + ancestors
        // + " -> " + result);
        return result;
    }
230 
231 
    //todo adapt POS and DEP (UD)
    /**
     * Recursive helper for {@link #extractHeads}: tries to accept {@code term} as a head
     * (for verbs, first redirecting to the SRL head if different), then recurses into
     * coordinated dependents when accepted, or into all dependents when rejected.
     *
     * @return true if this term (or its SRL head) was accepted as a head
     */
    private static boolean extractHeadsHelper(final KAFDocument document, final Term term,
            final java.util.function.Predicate<Term> predicate, final Collection<Term> result) {
        final String pos = extendedPos(document, term);
        boolean accepted = false;
        // For verbs, prefer the SRL head over the syntactic one when they differ
        if (pos.startsWith("V")) {
            final Term srlHead = syntacticToSRLHead(document, term);
            if (!term.equals(srlHead)) {
                accepted = extractHeadsHelper(document, srlHead, predicate, result);
            }
        }
        // Accept this term itself if the SRL redirection did not succeed and the filter passes
        if (!accepted && (predicate == null || predicate.test(term))) {
            result.add(term);
            accepted = true;
        }
        if (accepted) {
            // Follow coordination links so that all conjuncts are reported as heads
            for (final Dep dep : document.getDepsFromTerm(term)) {
                if (dep.getRfunc().toUpperCase().contains("COORD")) {
                    extractHeadsHelper(document, dep.getTo(), predicate, result);
                }
            }
        } else {
            // Term rejected: keep searching for heads among all its dependents
            for (final Dep dep : document.getDepsFromTerm(term)) {
                extractHeadsHelper(document, dep.getTo(), predicate, result);
            }
        }
        return accepted;
    }
260 
    /**
     * Tells whether the supplied term is the head of (one of the spans of) the given
     * annotation.
     *
     * @param document the document the annotation belongs to, not null
     * @param annotation a Coref, Entity, Timex3, Predicate or Role, not null
     * @param head the candidate head term
     * @return true if some span of the annotation has {@code head} as its head
     * @throws IllegalArgumentException if the annotation type is unsupported
     */
    public static boolean hasHead(final KAFDocument document, final Object annotation,
            final Term head) {
        List<Span<Term>> spans;
        if (annotation instanceof Coref) {
            spans = ((Coref) annotation).getSpans();
        } else if (annotation instanceof Entity) {
            spans = ((Entity) annotation).getSpans();
        } else if (annotation instanceof Timex3) {
            // Timex spans are over word forms: map them back to a term span first
            spans = ImmutableList.of(KAFDocument.newTermSpan(document
                    .getTermsByWFs(((Timex3) annotation).getSpan().getTargets())));
        } else if (annotation instanceof Predicate) {
            spans = ImmutableList.of(((Predicate) annotation).getSpan());
        } else if (annotation instanceof Role) {
            spans = ImmutableList.of(((Role) annotation).getSpan());
        } else {
            throw new IllegalArgumentException("Unsupported annotation: " + annotation);
        }
        for (final Span<Term> span : spans) {
            // NOTE(review): identity (==) comparison — assumes Term objects are canonical
            // within the document; confirm extractHead cannot return an equal-but-distinct copy
            if (head == extractHead(document, span)) {
                return true;
            }
        }
        return false;
    }
285 
    /**
     * Builds the nominal span centered on the given term, enlarging it with the terms of
     * entity / timex markables containing the term and, depending on the flags, with
     * coordinated terms and modifier subtrees reachable in the dependency tree.
     *
     * @param document the document the term belongs to, not null
     * @param term the term to build the span around, not null
     * @param includeCoord whether to include coordinated terms (COORD/CONJ chains)
     * @param includeModifiers whether to include modifier subtrees (NAME/NMOD/AMOD/TMP)
     * @return the resulting span, sorted by term offset, with its head set
     */
    public static Span<Term> getNominalSpan(final KAFDocument document, final Term term,
            final boolean includeCoord, final boolean includeModifiers) {

        // Start from the supplied term
        final Set<Term> terms = Sets.newHashSet(term);

        // Identify head and terms of all NE and TIMEX markables containing supplied term
        final Map<Term, List<Term>> markables = Maps.newHashMap();
        for (final Entity entity : document.getEntitiesByTerm(term)) {
            markables.put(document.getTermsHead(entity.getTerms()), entity.getTerms());
        }
        for (final WF wf : term.getWFs()) {
            for (final Timex3 timex : document.getTimeExsByWF(wf)) {
                final List<Term> span = document.getTermsByWFs(timex.getSpan().getTargets());
                markables.put(document.getTermsHead(span), span);
            }
        }

        // Add the terms of the smallest markable 'matching' the term (i.e., whose head matches
        // the term or a term ancestor in the dependency tree)
        if (!markables.isEmpty()) {
            Term t = term;
            while (true) {
                final List<Term> parent = markables.get(t);
                if (parent != null) {
                    terms.addAll(parent);
                    break;
                }
                final Dep dep = document.getDepToTerm(t);
                if (dep == null) {
                    break;
                }
                t = dep.getFrom();
            }
        }

        // Identify head
        final Term head = document.getTermsHead(terms);

        // Add all terms reachable from the head using a regex
        final String regex = includeCoord ? includeModifiers ? "(COORD CONJ?)* ((NAME|NMOD|AMOD|TMP) .*)?"
                : "(COORD CONJ?)* NAME"
                : includeModifiers ? "((NAME|NMOD|AMOD|TMP) .*)?" : "NAME";
        terms.addAll(document.getTermsByDepAncestors(Collections.singleton(head), regex));

        // Sort obtained terms by offset and return resulting list
        return KAFDocument.newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms),
                head);
    }
335 
336     @Nullable
337     public static String extractLemma(@Nullable final String rolesetOrRole) {
338         if (rolesetOrRole == null) {
339             return null;
340         }
341         int index = rolesetOrRole.indexOf('.');
342         if (index < 0) {
343             index = rolesetOrRole.indexOf('@');
344         }
345         return (index >= 0 ? rolesetOrRole.substring(0, index) : rolesetOrRole).toLowerCase();
346     }
347 
348     @Nullable
349     public static Integer extractSense(@Nullable final String rolesetOrRole) {
350         if (rolesetOrRole == null) {
351             return null;
352         }
353         final int start = Math.max(0, rolesetOrRole.indexOf('.') + 1);
354         int end = rolesetOrRole.indexOf('@');
355         end = end > 0 ? end : rolesetOrRole.length();
356         try {
357             return Integer.valueOf(rolesetOrRole.substring(start, end));
358         } catch (final Throwable ex) {
359             return null;
360         }
361     }
362 
363     @Nullable
364     public static Integer extractArgNum(@Nullable final String role) {
365         if (role == null) {
366             return null;
367         }
368         int index = role.length();
369         while (index > 0 && Character.isDigit(role.charAt(index - 1))) {
370             --index;
371         }
372         return index == role.length() ? null : Integer.valueOf(role.substring(index));
373     }
374 
375     // OFFSETS
376 
    /**
     * Returns the character offset where the term begins.
     *
     * @param term the term, not null
     * @return the offset of the term's first character
     */
    public static int getBegin(final Term term) {
        return term.getOffset();
    }
380 
    /**
     * Returns the character offset just after the term ends, based on its last word form.
     *
     * @param term the term, not null
     * @return the end offset (exclusive)
     */
    public static int getEnd(final Term term) {
        final List<WF> wfs = term.getWFs();
        final WF wf = wfs.get(wfs.size() - 1);
        final String str = wf.getForm();
        // PTB-style escaped brackets and the closing double quote occupy a single character
        // in the original text, so count 1 instead of the escaped form's length
        if (str.equals("-LSB-") || str.equals("-RSB-") || str.equals("''")) {
            return wf.getOffset() + 1;
        }
        return wf.getOffset() + wf.getLength();
    }
390 
391     public static int getLength(final Term term) {
392         return getEnd(term) - term.getOffset();
393     }
394 
395     public static String getRoleset(final Predicate predicate) {
396         final String res = predicate.getTerms().get(0).getPos().equalsIgnoreCase("V") ? RESOURCE_PROPBANK
397                 : RESOURCE_NOMBANK;
398         String roleset = null;
399         for (final ExternalRef ref : predicate.getExternalRefs()) {
400             if (res.equalsIgnoreCase(ref.getResource())) {
401                 if (ref.getSource() != null) {
402                     roleset = ref.getReference();
403                     break;
404                 } else if (roleset == null) {
405                     roleset = ref.getReference();
406                 }
407             }
408         }
409         return roleset;
410     }
411 
412     // EXTERNAL REFS
413 
414     @Nullable
415     public static ExternalRef getRef(@Nullable final Object annotation,
416             @Nullable final String resource, @Nullable final String reference) {
417         ExternalRef result = null;
418         for (final ExternalRef ref : getRefs(annotation)) {
419             if (matchRef(ref, resource, reference)) {
420                 if (result != null) {
421                     throw new IllegalStateException("Multiple ExternalRef matched for resource "
422                             + resource + ", reference " + reference + ": " + ref.getReference()
423                             + ", " + result.getReference());
424                 }
425                 result = ref;
426             }
427         }
428         return result;
429     }
430 
431     public static List<ExternalRef> getRefs(final Object annotation,
432             @Nullable final String resource, @Nullable final String reference) {
433         final List<ExternalRef> result = Lists.newArrayList();
434         for (final ExternalRef ref : getRefs(annotation)) {
435             if (matchRef(ref, resource, reference)) {
436                 result.add(ref);
437             }
438         }
439         return result;
440     }
441 
442     public static void removeRefs(final Object annotation, @Nullable final String resource,
443             @Nullable final String reference) {
444         final List<ExternalRef> refs = getRefs(annotation);
445         for (final Iterator<ExternalRef> i = refs.iterator(); i.hasNext();) {
446             final ExternalRef ref = i.next();
447             if (matchRef(ref, resource, reference)) {
448                 i.remove();
449             }
450         }
451     }
452 
    /**
     * Adds an external reference to the annotation (no duplicate check is performed).
     *
     * @param annotation the annotation to modify, not null
     * @param ref the reference to add, not null
     */
    public static void addRef(final Object annotation, final ExternalRef ref) {
        getRefs(annotation).add(ref);
    }
456 
    /**
     * Sets an external reference on the annotation, first removing any existing reference
     * with the same resource and reference values.
     *
     * @param annotation the annotation to modify, not null
     * @param ref the reference to set, not null
     */
    public static void setRef(final Object annotation, final ExternalRef ref) {
        removeRefs(annotation, ref.getResource(), ref.getReference());
        getRefs(annotation).add(ref);
    }
461 
    /**
     * Returns a human-readable description of the given NAF annotation, including its ID and
     * covered text where available.
     *
     * @param annotation the annotation to describe (term, entity, timex, predicate, role,
     *            opinion or one of its parts, factuality, or coref), not null
     * @return the description string
     * @throws IllegalArgumentException if the annotation type is unsupported
     */
    public static String toString(final Object annotation) {
        if (annotation instanceof Term) {
            final Term term = (Term) annotation;
            return "term " + term.getId() + " '" + term + "'";
        } else if (annotation instanceof Entity) {
            final Entity entity = (Entity) annotation;
            return "entity " + entity.getId() + " '" + entity.getStr() + "'";
        } else if (annotation instanceof Timex3) {
            final Timex3 timex = (Timex3) annotation;
            return "timex " + timex.getId() + " '" + timex.getSpan().getStr() + "'";
        } else if (annotation instanceof Predicate) {
            final Predicate pred = (Predicate) annotation;
            return "predicate " + pred.getId() + " '" + pred.getSpan().getStr() + "'";
        } else if (annotation instanceof Role) {
            final Role role = (Role) annotation;
            return "role " + role.getId() + " '" + role.getStr() + "' (" + role.getSemRole() + ")";
        } else if (annotation instanceof Opinion) {
            return "opinion " + ((Opinion) annotation).getId();
        } else if (annotation instanceof OpinionTarget) {
            return "opinion target '" + ((OpinionTarget) annotation).getSpan().getStr() + "'";
        } else if (annotation instanceof OpinionHolder) {
            return "opinion holder '" + ((OpinionHolder) annotation).getSpan().getStr() + "'";
        } else if (annotation instanceof OpinionExpression) {
            return "opinion expression '" + ((OpinionExpression) annotation).getSpan().getStr()
                    + "'";
        } else if (annotation instanceof Factuality) {
            final Factuality fact = (Factuality) annotation;
            return "factuality " + fact.getId() + " '" + fact.getWord().getStr() + "'";
        } else if (annotation instanceof Coref) {
            return "coref " + ((Coref) annotation).getId();
        } else {
            throw new IllegalArgumentException("Unsupported annotation object: " + annotation);
        }
    }
496 
497     private static List<ExternalRef> getRefs(final Object annotation) {
498         List<ExternalRef> refs = ImmutableList.of();
499         if (annotation instanceof Term) {
500             refs = ((Term) annotation).getExternalRefs();
501         } else if (annotation instanceof Entity) {
502             refs = ((Entity) annotation).getExternalRefs();
503         } else if (annotation instanceof Predicate) {
504             refs = ((Predicate) annotation).getExternalRefs();
505         } else if (annotation instanceof Role) {
506             refs = ((Role) annotation).getExternalRefs();
507         } else if (annotation instanceof Opinion) {
508             refs = ((Opinion) annotation).getExternalRefs();
509         } else if (annotation instanceof OpinionExpression) {
510             refs = ((OpinionExpression) annotation).getExternalRefs();
511         } else if (annotation instanceof OpinionTarget) {
512             refs = ((OpinionTarget) annotation).getExternalRefs();
513         } else if (annotation instanceof OpinionHolder) {
514             refs = ((OpinionHolder) annotation).getExternalRefs();
515         } else {
516             throw new IllegalArgumentException("Unsupported annotation object: " + annotation);
517         }
518         return refs;
519     }
520 
521     private static boolean matchRef(final ExternalRef ref, @Nullable final String resource,
522             @Nullable final String reference) {
523         return (resource == null || resource.equalsIgnoreCase(ref.getResource()))
524                 && (reference == null || reference.equals(ref.getReference()));
525     }
526 
527     private static void normalizeRefs(final Collection<ExternalRef> refs) {
528         final Set<String> seen = Sets.newHashSet();
529         for (final Iterator<ExternalRef> i = refs.iterator(); i.hasNext();) {
530             final ExternalRef ref = i.next();
531             final String key = ref.getResource() + "|" + ref.getReference();
532             if (!seen.add(key)) {
533                 i.remove();
534             }
535         }
536     }
537 
538     public static List<Range> termRangesFor(final KAFDocument document, final Iterable<Term> terms) {
539         final List<Range> ranges = Lists.newArrayList();
540         int startIndex = -1;
541         int lastIndex = -2;
542         for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
543             final int termIndex = document.getTerms().indexOf(term);
544             if (termIndex - lastIndex > 1) {
545                 if (startIndex >= 0) {
546                     ranges.add(Range.create(startIndex, lastIndex + 1));
547                 }
548                 startIndex = termIndex;
549             }
550             lastIndex = termIndex;
551         }
552         if (startIndex != -1 && lastIndex >= startIndex) {
553             ranges.add(Range.create(startIndex, lastIndex + 1));
554         }
555         return ranges;
556     }
557 
558     public static List<Range> rangesFor(final KAFDocument document, final Iterable<Term> terms) {
559         final List<Range> ranges = Lists.newArrayList();
560         int startOffset = -1;
561         int endOffset = -1;
562         int termIndex = -2;
563         for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
564             final int lastTermIndex = termIndex;
565             termIndex = document.getTerms().indexOf(term);
566             if (termIndex - lastTermIndex > 1) {
567                 if (startOffset != -1) {
568                     ranges.add(Range.create(startOffset, endOffset));
569                 }
570                 startOffset = term.getOffset();
571             }
572             endOffset = NAFUtils.getEnd(term);
573         }
574         if (startOffset != -1 && endOffset > startOffset) {
575             ranges.add(Range.create(startOffset, endOffset));
576         }
577         return ranges;
578     }
579 
    /**
     * Returns the character range [begin, end) covered by the term.
     *
     * @param term the term, not null
     * @return the covering range
     */
    public static Range rangeFor(final Term term) {
        return Range.create(NAFUtils.getBegin(term), NAFUtils.getEnd(term));
    }
583 
    /**
     * Returns the character range spanning all the supplied terms, from the smallest begin
     * offset to the largest end offset.
     *
     * @param terms the terms to cover, not null
     * @return the covering range
     */
    public static Range rangeFor(final Iterable<Term> terms) {
        // NOTE(review): an empty iterable yields Range.create(MAX_VALUE, MIN_VALUE) —
        // confirm callers never pass an empty collection
        int begin = Integer.MAX_VALUE;
        int end = Integer.MIN_VALUE;
        for (final Term term : terms) {
            begin = Math.min(begin, getBegin(term));
            end = Math.max(end, getEnd(term));
        }
        return Range.create(begin, end);
    }
593 
594     @Nullable
595     public static Span<Term> trimSpan(@Nullable final Span<Term> span, final int sentenceID) {
596         if (span == null || span.isEmpty()) {
597             return null;
598         }
599         boolean sameSentence = true;
600         for (final Term term : span.getTargets()) {
601             if (term.getSent() != sentenceID) {
602                 sameSentence = false;
603                 break;
604             }
605         }
606         if (sameSentence) {
607             return span;
608         }
609         final List<Term> filteredTerms = Lists.newArrayList();
610         for (final Term term : span.getTargets()) {
611             if (term.getSent() == sentenceID) {
612                 filteredTerms.add(term);
613             }
614         }
615         final Span<Term> result = KAFDocument.newTermSpan(filteredTerms);
616         for (final Term head : span.getHeads()) {
617             if (head.getSent() == sentenceID) {
618                 result.getHeads().add(head);
619             }
620         }
621         return result;
622     }
623 
624     // Span methods
625 
    /**
     * Normalizes a span so that it has a well-defined head: if the span's terms have a
     * single dependency root, that root becomes the head; otherwise the method either
     * enlarges the span up to the closest common dependency ancestor outside it (external
     * head) or shrinks it to the coverage of the best internal head, returning the candidate
     * whose size is closest to the original span (external preferred on ties).
     *
     * @param document the document the span belongs to, not null
     * @param span the span to normalize, possibly null
     * @return a normalized span (empty for a null/empty input), never null
     */
    public static Span<Term> normalizeSpan(final KAFDocument document,
            @Nullable final Span<Term> span) {

        // Handle null and empty span
        if (span == null || Iterables.isEmpty(span.getTargets())) {
            return KAFDocument.newTermSpan();
        }

        // Identify all the 'root' terms in the span whose dep tree parent is outside the span
        final Set<Term> roots = Sets.newHashSet();
        final Set<Term> terms = ImmutableSet.copyOf(span.getTargets());
        for (final Term term : terms) {
            final Dep dep = document.getDepToTerm(term);
            if (dep == null || !terms.contains(dep.getFrom())) {
                roots.add(term);
            }
        }

        // If only one 'root', return the normalized span having that root as the head
        if (roots.size() == 1) {
            return KAFDocument.newTermSpan(span.getTargets(), roots.iterator().next());
        }

        // Otherwise, look for the closest head outside the span. First compute all the paths from
        // the dep tree roots to the 'root' terms identified before
        final List<List<Term>> paths = Lists.newArrayList();
        for (final Term root : roots) {
            final List<Term> path = Lists.newArrayList(root);
            for (Dep dep = document.getDepToTerm(root); dep != null; dep = document
                    .getDepToTerm(dep.getFrom())) {
                path.add(dep.getFrom());
            }
            // Paths are collected bottom-up, so reverse them to go root-to-leaf
            Collections.reverse(path);
            paths.add(path);
        }

        // Then look for the deepest node common to all those paths
        int depth = 0;
        Term externalHead = null;
        outer: for (; depth < paths.get(0).size(); ++depth) {
            final Term t = paths.get(0).get(depth);
            for (int i = 1; i < paths.size(); ++i) {
                final List<Term> path = paths.get(i);
                if (depth >= path.size() || !path.get(depth).equals(t)) {
                    break outer;
                }
            }
            externalHead = t;
        }

        // If found, compute the terms for the external span
        Set<Term> externalTerms = null;
        if (externalHead != null) {
            externalTerms = Sets.newHashSet(terms);
            externalTerms.add(externalHead);
            // Include the connecting terms between the common ancestor and each span root
            for (final List<Term> path : paths) {
                externalTerms.addAll(path.subList(depth, path.size()));
            }
        }

        // Now look for the internal head that covers the most part terms of the span. Start by
        // associating to each candidate internal head the terms it would cover
        final Multimap<Term, Term> map = HashMultimap.create();
        for (final Term term : terms) {
            Dep dep = document.getDepToTerm(term);
            if (dep == null) {
                map.put(term, term);
            } else {
                for (; dep != null; dep = document.getDepToTerm(dep.getFrom())) {
                    if (!terms.contains(dep.getFrom())) {
                        map.put(dep.getTo(), term);
                        break;
                    }
                }
            }
        }

        // Then identify the best internal head
        // NOTE(review): size ties are resolved by HashMultimap iteration order, so the chosen
        // head may differ across runs for equal-sized candidates — confirm determinism is not
        // required here
        Term internalHead = null;
        Collection<Term> internalTerms = null;
        for (final Map.Entry<Term, Collection<Term>> entry : map.asMap().entrySet()) {
            if (internalHead == null || entry.getValue().size() >= internalTerms.size()) {
                internalTerms = entry.getValue();
                internalHead = entry.getKey();
            }
        }

        // Return either the external span (if defined) or the internal one, based on which one is
        // most similar in size to the original span (if equal, prefer external one).
        if (externalTerms != null
                && externalTerms.size() - terms.size() <= terms.size() - internalTerms.size()) {
            return KAFDocument.newTermSpan(
                    Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(externalTerms), externalHead);
        } else {
            return KAFDocument.newTermSpan(
                    Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(internalTerms), internalHead);
        }
    }
724 
725     public static List<Span<Term>> mergeSpans(final KAFDocument document,
726             final Iterable<Span<Term>> spans, final boolean canAddTerms) {
727 
728         // Build a map associating to each span head the other heads it is coordinated with
729         final Map<Term, List<Term>> extents = Maps.newHashMap();
730         final Map<Term, Set<Term>> clusters = Maps.newHashMap();
731         for (final Span<Term> span : spans) {
732             final Term head = extractHead(document, span);
733             clusters.put(head, Sets.newHashSet(head));
734             extents.put(head, span.getTargets());
735         }
736         for (final Term head : clusters.keySet()) {
737             for (Dep dep = document.getDepToTerm(head); dep != null
738                     && ("CONJ".equals(dep.getRfunc()) || "COORD".equals(dep.getRfunc())); dep = document
739                     .getDepToTerm(dep.getFrom())) {
740                 if (clusters.keySet().contains(dep.getFrom())) {
741                     clusters.get(head).add(dep.getFrom());
742                     clusters.get(dep.getFrom()).add(head);
743                 } else if ("CO".indexOf(dep.getFrom().getPos()) < 0) {
744                     break; // don't include intermediate terms that are not conjunctions or commas
745                 }
746             }
747         }
748 
749         // Create a span for each cluster of heads, including intermediate conjunctions
750         final List<Span<Term>> result = Lists.newArrayList();
751         while (!clusters.isEmpty()) {
752             final Set<Term> heads = clusters.values().iterator().next();
753             final Set<Term> terms = Sets.newHashSet();
754             Term spanHead = heads.iterator().next();
755             for (final Term head : heads) {
756                 clusters.remove(head);
757                 terms.addAll(extents.get(head));
758                 final List<Term> path = Lists.newArrayList();
759                 for (Dep dep = document.getDepToTerm(head); dep != null; dep = document
760                         .getDepToTerm(dep.getFrom())) {
761                     final Term term = dep.getFrom();
762                     path.add(term);
763                     if (heads.contains(term)) {
764                         terms.addAll(path);
765                         path.clear();
766                         spanHead = term;
767                     }
768                 }
769             }
770             List<Term> spanTerms = Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms);
771             if (canAddTerms) {
772                 final List<Term> docTerms = document.getTerms();
773                 spanTerms = Lists.newArrayList(docTerms.subList(
774                         docTerms.indexOf(spanTerms.get(0)),
775                         docTerms.indexOf(spanTerms.get(spanTerms.size() - 1)) + 1));
776             }
777             result.add(KAFDocument.newTermSpan(spanTerms, spanHead));
778         }
779         return result;
780     }
781 
    /**
     * Splits the supplied spans into one span per head, taking coordination into account:
     * conjuncts reachable from a span head via COORD/CONJ dependencies become heads of their own
     * spans. Each resulting span contains the original terms that are dependency descendants of
     * its head but of no other head; empty spans are omitted.
     *
     * @param document the NAF document providing the dependency tree
     * @param spans the spans to split
     * @return the split spans, one per identified head
     */
    public static final List<Span<Term>> splitSpans(final KAFDocument document,
            final Iterable<Span<Term>> spans) {

        // Identify all the heads taking coordination into consideration
        final Set<Term> heads = Sets.newHashSet();
        final Set<Term> terms = Sets.newHashSet();
        for (final Span<Term> span : spans) {
            final Term head = extractHead(document, span);
            heads.add(head);
            terms.addAll(span.getTargets());

            // BFS over COORD/CONJ dependencies starting from the span head
            final List<Term> queue = Lists.newLinkedList();
            queue.add(head);
            while (!queue.isEmpty()) {
                final Term term = queue.remove(0);
                for (final Dep dep : document.getDepsFromTerm(term)) {
                    final String func = dep.getRfunc();
                    if ("COORD".equals(func) || "CONJ".equals(func)) {
                        final Term t = dep.getTo();
                        queue.add(t);
                        // NOTE(review): when the conjunct t is a conjunction (CC morphofeat) or
                        // punctuation, the PARENT 'term' is added as head, not 't' itself —
                        // confirm this is intended. Also assumes t.getMorphofeat() is non-empty,
                        // otherwise charAt(0) throws.
                        if ("CC".equals(t.getMorphofeat())
                                || !Character.isLetter(t.getMorphofeat().charAt(0))) {
                            heads.add(term);
                        }
                    }
                }
            }
        }

        // Build and return a span for each head
        final Set<Term> excluded = document.getTermsByDepDescendants(heads);
        final List<Span<Term>> result = Lists.newArrayList();
        for (final Term head : heads) {
            final Set<Term> extent = document.getTermsByDepAncestors(ImmutableSet.of(head));
            extent.removeAll(excluded); // drop terms belonging to other heads
            extent.add(head);
            extent.retainAll(terms); // keep only terms from the original spans
            if (!extent.isEmpty()) {
                result.add(KAFDocument.newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR)
                        .sortedCopy(extent), head));
            }
        }
        return result;
    }
826 
827     public static final List<Span<Term>> splitSpan(final KAFDocument document,
828             final Span<Term> span, final Iterable<Term> heads) {
829 
830         final Set<Term> excludedTerms = document.getTermsByDepDescendants(heads);
831         final List<Span<Term>> spans = Lists.newArrayList();
832         for (final Term head : heads) {
833             final Set<Term> terms = document.getTermsByDepAncestors(ImmutableSet.of(head));
834             terms.removeAll(excludedTerms);
835             terms.add(head);
836             terms.retainAll(span.getTargets());
837             if (!terms.isEmpty()) {
838                 spans.add(KAFDocument.newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR)
839                         .sortedCopy(terms), head));
840             }
841         }
842         return spans;
843     }
844 
845     // End
846 
847     public static KAFDocument readDocument(@Nullable final Path path) throws IOException {
848         final KAFDocument document;
849         if (path == null) {
850             document = KAFDocument.createFromStream(IO.utf8Reader(IO.buffer(System.in)));
851             document.getPublic().publicId = "";
852         } else {
853             try (BufferedReader reader = Files.newBufferedReader(path)) {
854                 document = KAFDocument.createFromStream(reader);
855                 document.getPublic().publicId = path.toString();
856             }
857         }
858         return document;
859     }
860 
861     public static void writeDocument(final KAFDocument document, @Nullable final Path location)
862             throws IOException {
863         if (location == null) {
864             System.out.write(document.toString().getBytes(Charsets.UTF_8));
865         } else {
866             try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(location.toString())))) {
867                 writer.write(document.toString());
868             }
869         }
870     }
871 
872     //todo adapt DEP (UD): check VC and IM
873     public static Term syntacticToSRLHead(final KAFDocument document, final Term term) {
874         for (final Dep dep : document.getDepsFromTerm(term)) {
875             final String func = dep.getRfunc();
876             if ("VC".equals(func) || "IM".equals(func)) {
877                 return syntacticToSRLHead(document, dep.getTo());
878             }
879         }
880         return term;
881     }
882 
883     public static Term srlToSyntacticHead(final KAFDocument document, final Term term) {
884         final Dep dep = document.getDepToTerm(term);
885         if (dep != null) {
886             final String func = dep.getRfunc();
887             if ("VC".equals(func) || "IM".equals(func)) {
888                 return srlToSyntacticHead(document, dep.getFrom());
889             }
890         }
891         return term;
892     }
893 
894     // Accounts for demonstrative pronouns
895 
896     public static String extendedPos(final KAFDocument document, final Term term) {
897         final String pos = term.getMorphofeat();
898         final String lemma = term.getLemma().toLowerCase();
899         if ("some".equals(lemma) || "many".equals(lemma) || "all".equals(lemma)
900                 || "few".equals(lemma) || "this".equals(lemma) || "these".equals(lemma)
901                 || "that".equals(lemma) || "those".equals(lemma)) {
902             final Dep dep = document.getDepToTerm(term);
903             if (dep == null || !"NMOD".equals(dep.getRfunc())) {
904                 return pos + "P"; // determiner (DT) or adj (JJ) used as demonstrative pronoun
905             }
906         }
907         return pos;
908     }
909 
910     public static Boolean isActiveForm(final KAFDocument document, final Term term) {
911         final String word = term.getStr().toLowerCase();
912         final String pos = term.getMorphofeat();
913         if (!pos.startsWith("V")) {
914             return null;
915         }
916         if (word.equals("been") || !pos.equals("VBN")) {
917             return Boolean.TRUE;
918         }
919         return isActiveFormHelper(document, term);
920     }
921 
922     private static Boolean isActiveFormHelper(final KAFDocument document, final Term term) {
923         final Dep dep = document.getDepToTerm(term);
924         if (dep == null) {
925             return Boolean.FALSE;
926         }
927         final Term parent = dep.getFrom();
928         final String word = parent.getStr().toLowerCase();
929         final String pos = parent.getMorphofeat();
930         if (pos.startsWith("NN")) {
931             return Boolean.FALSE;
932         }
933         if (word.matches("am|are|is|was|were|be|been|being")) {
934             return Boolean.FALSE;
935         }
936         if (word.matches("ha(ve|s|d|ving)")) {
937             return Boolean.TRUE;
938         }
939 
940         if (pos.matches("VBZ|VBD|VBP|MD")) {
941             return Boolean.FALSE;
942         }
943         return isActiveFormHelper(document, parent);
944     }
945 
946     public static java.util.function.Predicate<Term> matchExtendedPos(final KAFDocument document,
947             final String... posPrefixes) {
948         return new java.util.function.Predicate<Term>() {
949 
950             @Override
951             public boolean test(final Term term) {
952                 final String pos = extendedPos(document, term);
953                 for (final String prefix : posPrefixes) {
954                     if (pos.startsWith(prefix)) {
955                         return true;
956                     }
957                 }
958                 return false;
959             }
960 
961         };
962     }
963 
964     // extracts descendents that are consecutive with the supplied head
965     public static Set<Term> getTermsByDepAncestor(final KAFDocument document, final Term head,
966             final boolean consecutive) {
967         final Set<Term> descendants = document.getTermsByDepAncestors(ImmutableSet.of(head));
968         if (consecutive) {
969             final List<Term> sortedTerms = Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(
970                     descendants);
971             final int[] indexes = new int[sortedTerms.size()];
972             for (int i = 0; i < sortedTerms.size(); ++i) {
973                 indexes[i] = document.getTerms().indexOf(sortedTerms.get(i));
974             }
975             final int h = sortedTerms.indexOf(head);
976             boolean filtered = false;
977             for (int i = h + 1; i < indexes.length; ++i) {
978                 filtered |= indexes[i] > indexes[i - 1] + 1;
979                 if (filtered) {
980                     descendants.remove(sortedTerms.get(i));
981                 }
982             }
983             filtered = false;
984             for (int i = h - 1; i >= 0; --i) {
985                 filtered |= indexes[i] < indexes[i + 1] - 1;
986                 if (filtered) {
987                     descendants.remove(sortedTerms.get(i));
988                 }
989             }
990         }
991         return descendants;
992     }
993 
994 
995     public static IRI createPreMOnSemanticClassIRIfor(String model, String predicate){
996 
997         String prefix = "";
998         switch (model) {
999 
1000             case RESOURCE_FRAMENET : prefix+=PREMON_FNPREFIX+"-"; break;
1001             case RESOURCE_VERBNET : prefix+=PREMON_VNPREFIX+"-"; break;
1002             case RESOURCE_PROPBANK  : prefix+=PREMON_PBPREFIX+"-"; break;
1003             case RESOURCE_NOMBANK  : prefix+=PREMON_NBPREFIX+"-"; break;
1004 
1005         }
1006 
1007         //works for fn15,pb17,vn32,nb10... in case of other version, some cautions have to be take on predicate (e.g.m FedEx or UPS in pb215)
1008         String localname=prefix+predicate.toLowerCase();
1009 
1010         return Statements.VALUE_FACTORY.createIRI(PREMON_NAMESPACE, localname);
1011 
1012     }
1013 
1014 
    /**
     * Builds the PreMOn IRI identifying a semantic role of a predicate in the given model.
     *
     * @param model one of the RESOURCE_* constants (FrameNet, VerbNet, PropBank, NomBank)
     * @param predicate the predicate / frame name, lowercased for the IRI local name
     * @param role the role label; for PB/NB expected as A0, AA, AM-TMP; for VN/FN only
     *            lowercased
     * @return the PreMOn semantic role IRI
     */
    public static IRI createPreMOnSemanticRoleIRIfor(String model, String predicate, String role){

        String prefix = "";

        //works for fn15,pb17,vn32,nb10... in case of other version, some cautions have to be take on predicate (e.g.m FedEx or UPS in pb215)
        //expect role as follow
        //PB,NB: A0,AA, AM-TMP
        //VB,FN: don't care
        switch (model) {
            case RESOURCE_FRAMENET : prefix+=PREMON_FNPREFIX+"-";
                role=role.toLowerCase();
                break;
            case RESOURCE_VERBNET : prefix+=PREMON_VNPREFIX+"-";
                role=role.toLowerCase();
                break;
            case RESOURCE_PROPBANK  : prefix+=PREMON_PBPREFIX+"-";
                role=role.toLowerCase();//.replace("arg-","a").replace("a","arg");
                // NOTE(review): String.replace substitutes ALL occurrences, so "aa" becomes
                // "argarg" and "am-tmp" becomes "argtmp" — confirm these local names match the
                // PreMOn PropBank role naming scheme.
                if (!role.contains("am-")) role=role.replace("a","arg");
                else role=role.replace("am-","arg");
                break;
            case RESOURCE_NOMBANK  : prefix+=PREMON_NBPREFIX+"-";
                role=role.toLowerCase();//.replace("arg-","a").replace("a","arg");
                // NOTE(review): same replace-all caveat as the PropBank branch above.
                if (!role.contains("am-")) role=role.replace("a","arg");
                else role=role.replace("am-","arg");
                break;
        }

        String localname=prefix+predicate.toLowerCase()+PREMON_ARGUMENT_SEPARATOR+role;

        return Statements.VALUE_FACTORY.createIRI(PREMON_NAMESPACE, localname);

    }
1047 
1048 }