1 package eu.fbk.dkm.pikes.resources;
2
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import javax.annotation.Nullable;

import com.google.common.base.Charsets;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;

import eu.fbk.rdfpro.util.Statements;
import eu.fbk.utils.core.Range;
import ixa.kaflib.Coref;
import ixa.kaflib.Dep;
import ixa.kaflib.Entity;
import ixa.kaflib.ExternalRef;
import ixa.kaflib.Factuality;
import ixa.kaflib.KAFDocument;
import ixa.kaflib.Opinion;
import ixa.kaflib.Opinion.OpinionExpression;
import ixa.kaflib.Opinion.OpinionHolder;
import ixa.kaflib.Opinion.OpinionTarget;
import ixa.kaflib.Predicate;
import ixa.kaflib.Predicate.Role;
import ixa.kaflib.Span;
import ixa.kaflib.Term;
import ixa.kaflib.Timex3;
import ixa.kaflib.WF;

import eu.fbk.rdfpro.util.IO;
import org.eclipse.rdf4j.model.IRI;
50
51 public final class NAFUtils {
52
// Resource names. In this class they are compared (case-insensitively) against
// ExternalRef.getResource() or passed to KAFDocument.newExternalRef(); constants not referenced
// in this class are presumably used the same way by callers - TODO confirm.

/** Resource name selected by {@code getRoleset} for predicates whose first term has POS "V". */
public static final String RESOURCE_PROPBANK = "PropBank";

/** Resource name selected by {@code getRoleset} for predicates whose first term is not POS "V". */
public static final String RESOURCE_NOMBANK = "NomBank";

/** Resource name "VerbNet"; also a model key accepted by the PreMOn IRI factory methods. */
public static final String RESOURCE_VERBNET = "VerbNet";

/** Resource name "FrameNet"; also a model key accepted by the PreMOn IRI factory methods. */
public static final String RESOURCE_FRAMENET = "FrameNet";

/** Resource name under which {@code normalize} stores a term's BBN tag as external reference. */
public static final String RESOURCE_BBN = "BBN";

/** Resource name under which {@code normalize} stores a term's WordNet sense. */
public static final String RESOURCE_WN_SYNSET = "wn30-ukb";

/** Resource name under which {@code normalize} stores a term's supersense tag. */
public static final String RESOURCE_WN_SST = "wn30-sst";

/** Resource name "SUMO" (not referenced elsewhere in this class). */
public static final String RESOURCE_SUMO = "SUMO";

/** Resource name "NAFFilter-EntityRef" (not referenced elsewhere in this class). */
public static final String RESOURCE_ENTITY_REF = "NAFFilter-EntityRef";

/** Resource name "NAFFilter-EntityCoref" (not referenced elsewhere in this class). */
public static final String RESOURCE_ENTITY_COREF = "NAFFilter-EntityCoref";

/** Resource name "NAFFilter-PredicateRef" (not referenced elsewhere in this class). */
public static final String RESOURCE_PREDICATE_REF = "NAFFilter-PredicateRef";

/** Resource name "NAFFilter-PredicateCoref" (not referenced elsewhere in this class). */
public static final String RESOURCE_PREDICATE_COREF = "NAFFilter-PredicateCoref";

/** Resource name "NAFFilter-TimexRef" (not referenced elsewhere in this class). */
public static final String RESOURCE_TIMEX_REF = "NAFFilter-TimexRef";

/** Resource name "NAFFilter-TimexCoref" (not referenced elsewhere in this class). */
public static final String RESOURCE_TIMEX_COREF = "NAFFilter-TimexCoref";

/** Resource name "value" (not referenced elsewhere in this class). */
public static final String RESOURCE_VALUE = "value";

/** Resource name "Yago" (not referenced elsewhere in this class). */
public static final String RESOURCE_YAGO = "Yago";

// PreMOn constants: namespace, per-model local-name prefixes and the predicate/role separator
// used by createPreMOnSemanticClassIRIfor and createPreMOnSemanticRoleIRIfor.
public static final String PREMON_NAMESPACE = "http://premon.fbk.eu/resource/";
public static final String PREMON_FNPREFIX = "fn15";
public static final String PREMON_VNPREFIX = "vb32";
public static final String PREMON_PBPREFIX = "pb17";
public static final String PREMON_NBPREFIX = "nb10";
public static final String PREMON_ARGUMENT_SEPARATOR = "@";
/** Resource name "PreMOn+PropBank" (not referenced elsewhere in this class). */
public static final String PREMON_RESOURCE_PROPBANK = "PreMOn+PropBank";

/** Resource name "PreMOn+NomBank" (not referenced elsewhere in this class). */
public static final String PREMON_RESOURCE_NOMBANK = "PreMOn+NomBank";

/** Resource name "PreMOn+VerbNet" (not referenced elsewhere in this class). */
public static final String PREMON_RESOURCE_VERBNET = "PreMOn+VerbNet";

/** Resource name "PreMOn+FrameNet" (not referenced elsewhere in this class). */
public static final String PREMON_RESOURCE_FRAMENET = "PreMOn+FrameNet";
98
99 public static final Ordering<Opinion> OPINION_COMPARATOR = new Ordering<Opinion>() {
100
101 @Override
102 public int compare(final Opinion left, final Opinion right) {
103 final int leftOffset = left.getOpinionExpression().getSpan().getTargets().get(0)
104 .getOffset();
105 final int rightOffset = right.getOpinionExpression().getSpan().getTargets().get(0)
106 .getOffset();
107 return leftOffset - rightOffset;
108 }
109
110 };
111
/**
 * Matches (as a whole) word forms made only of characters other than ASCII letters and digits,
 * including the empty string. {@code filterTerms} discards such forms unless they are listed
 * in {@link #SYMBOLS}.
 */
private static final Pattern WF_EXCLUSION_PATTERN = Pattern.compile("[^A-Za-z0-9]*");

/** Word forms that {@code filterTerms} keeps even though they contain no letter or digit. */
private static final Set<String> SYMBOLS = ImmutableSet.of("$", "#", "&", "€");
115
116 public static void normalize(final KAFDocument document) {
117
118
119 for (final Term term : document.getTerms()) {
120 boolean hasBBN = false;
121 boolean hasSynset = false;
122 boolean hasSST = false;
123 for (final ExternalRef ref : term.getExternalRefs()) {
124 hasBBN |= RESOURCE_BBN.equalsIgnoreCase(ref.getResource());
125 hasSynset |= RESOURCE_WN_SYNSET.equalsIgnoreCase(ref.getResource());
126 hasSST |= RESOURCE_WN_SST.equalsIgnoreCase(ref.getResource());
127 }
128 if (!hasBBN && term.getBBNTag() != null) {
129 term.addExternalRef(document.newExternalRef(RESOURCE_BBN, term.getBBNTag()));
130 }
131 if (!hasSynset && term.getWordnetSense() != null) {
132 term.addExternalRef(document.newExternalRef(RESOURCE_WN_SYNSET,
133 term.getWordnetSense()));
134 }
135 if (!hasSST && term.getSupersenseTag() != null) {
136 term.addExternalRef(document.newExternalRef(RESOURCE_WN_SST,
137 term.getSupersenseTag()));
138 }
139 term.setBBNTag(null);
140 term.setWordnetSense(null);
141 term.setSupersenseTag(null);
142 }
143
144
145 for (final Predicate predicate : document.getPredicates()) {
146 normalizeRefs(getRefs(predicate));
147 for (final Role role : predicate.getRoles()) {
148 normalizeRefs(getRefs(role));
149 }
150 }
151 }
152
153 public static List<Term> filterTerms(final Iterable<Term> terms) {
154 final List<Term> result = Lists.newArrayList();
155 boolean atBeginning = true;
156 for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
157 final char pos = Character.toUpperCase(term.getPos().charAt(0));
158 if (atBeginning && (pos == 'D' || pos == 'P')) {
159 continue;
160 }
161 for (final WF word : term.getWFs()) {
162 final String text = word.getForm();
163 if (SYMBOLS.contains(text) || !WF_EXCLUSION_PATTERN.matcher(text).matches()) {
164 result.add(term);
165 atBeginning = false;
166 break;
167 }
168 }
169 }
170 return result;
171 }
172
173 public static String getText(final Iterable<Term> terms) {
174 final StringBuilder builder = new StringBuilder();
175 boolean atBeginning = true;
176 for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
177 final boolean properNoun = term.getMorphofeat().startsWith("NNP");
178 for (final WF word : term.getWFs()) {
179 builder.append(atBeginning ? "" : " ");
180 builder.append(properNoun ? word.getForm() : word.getForm().toLowerCase());
181 atBeginning = false;
182 }
183 }
184 return builder.toString();
185 }
186
187 @Nullable
188
189 public static Term extractHead(final KAFDocument document, @Nullable final Span<Term> span) {
190 if (span == null) {
191 return null;
192 }
193 Term head = null;
194 if (head == null) {
195
196 head = document.getTermsHead(span.getTargets());
197 }
198 return head;
199 }
200
201 public static Set<Term> extractHeads(final KAFDocument document,
202 @Nullable final Iterable<Term> ancestors, @Nullable final Iterable<Term> span,
203 @Nullable final java.util.function.Predicate<Term> predicate) {
204
205 Set<Term> ancestorSet;
206 if (ancestors != null) {
207 ancestorSet = ImmutableSet.copyOf(ancestors);
208 } else {
209 ancestorSet = Sets.newHashSet();
210 final Set<Term> termSet = Sets.newHashSet(span);
211 for (final Term term : termSet) {
212 final Dep dep = document.getDepToTerm(term);
213 if (dep == null || !termSet.contains(dep.getFrom())) {
214 ancestorSet.add(term);
215 }
216 }
217 }
218
219 final Set<Term> result = Sets.newHashSet();
220 for (final Term ancestor : ancestorSet) {
221 extractHeadsHelper(document, ancestor, predicate, result);
222 }
223 if (span != null) {
224 result.retainAll(ImmutableSet.copyOf(span));
225 }
226
227
228 return result;
229 }
230
231
232
233 private static boolean extractHeadsHelper(final KAFDocument document, final Term term,
234 final java.util.function.Predicate<Term> predicate, final Collection<Term> result) {
235 final String pos = extendedPos(document, term);
236 boolean accepted = false;
237 if (pos.startsWith("V")) {
238 final Term srlHead = syntacticToSRLHead(document, term);
239 if (!term.equals(srlHead)) {
240 accepted = extractHeadsHelper(document, srlHead, predicate, result);
241 }
242 }
243 if (!accepted && (predicate == null || predicate.test(term))) {
244 result.add(term);
245 accepted = true;
246 }
247 if (accepted) {
248 for (final Dep dep : document.getDepsFromTerm(term)) {
249 if (dep.getRfunc().toUpperCase().contains("COORD")) {
250 extractHeadsHelper(document, dep.getTo(), predicate, result);
251 }
252 }
253 } else {
254 for (final Dep dep : document.getDepsFromTerm(term)) {
255 extractHeadsHelper(document, dep.getTo(), predicate, result);
256 }
257 }
258 return accepted;
259 }
260
261 public static boolean hasHead(final KAFDocument document, final Object annotation,
262 final Term head) {
263 List<Span<Term>> spans;
264 if (annotation instanceof Coref) {
265 spans = ((Coref) annotation).getSpans();
266 } else if (annotation instanceof Entity) {
267 spans = ((Entity) annotation).getSpans();
268 } else if (annotation instanceof Timex3) {
269 spans = ImmutableList.of(KAFDocument.newTermSpan(document
270 .getTermsByWFs(((Timex3) annotation).getSpan().getTargets())));
271 } else if (annotation instanceof Predicate) {
272 spans = ImmutableList.of(((Predicate) annotation).getSpan());
273 } else if (annotation instanceof Role) {
274 spans = ImmutableList.of(((Role) annotation).getSpan());
275 } else {
276 throw new IllegalArgumentException("Unsupported annotation: " + annotation);
277 }
278 for (final Span<Term> span : spans) {
279 if (head == extractHead(document, span)) {
280 return true;
281 }
282 }
283 return false;
284 }
285
286 public static Span<Term> getNominalSpan(final KAFDocument document, final Term term,
287 final boolean includeCoord, final boolean includeModifiers) {
288
289
290 final Set<Term> terms = Sets.newHashSet(term);
291
292
293 final Map<Term, List<Term>> markables = Maps.newHashMap();
294 for (final Entity entity : document.getEntitiesByTerm(term)) {
295 markables.put(document.getTermsHead(entity.getTerms()), entity.getTerms());
296 }
297 for (final WF wf : term.getWFs()) {
298 for (final Timex3 timex : document.getTimeExsByWF(wf)) {
299 final List<Term> span = document.getTermsByWFs(timex.getSpan().getTargets());
300 markables.put(document.getTermsHead(span), span);
301 }
302 }
303
304
305
306 if (!markables.isEmpty()) {
307 Term t = term;
308 while (true) {
309 final List<Term> parent = markables.get(t);
310 if (parent != null) {
311 terms.addAll(parent);
312 break;
313 }
314 final Dep dep = document.getDepToTerm(t);
315 if (dep == null) {
316 break;
317 }
318 t = dep.getFrom();
319 }
320 }
321
322
323 final Term head = document.getTermsHead(terms);
324
325
326 final String regex = includeCoord ? includeModifiers ? "(COORD CONJ?)* ((NAME|NMOD|AMOD|TMP) .*)?"
327 : "(COORD CONJ?)* NAME"
328 : includeModifiers ? "((NAME|NMOD|AMOD|TMP) .*)?" : "NAME";
329 terms.addAll(document.getTermsByDepAncestors(Collections.singleton(head), regex));
330
331
332 return KAFDocument.newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms),
333 head);
334 }
335
336 @Nullable
337 public static String extractLemma(@Nullable final String rolesetOrRole) {
338 if (rolesetOrRole == null) {
339 return null;
340 }
341 int index = rolesetOrRole.indexOf('.');
342 if (index < 0) {
343 index = rolesetOrRole.indexOf('@');
344 }
345 return (index >= 0 ? rolesetOrRole.substring(0, index) : rolesetOrRole).toLowerCase();
346 }
347
348 @Nullable
349 public static Integer extractSense(@Nullable final String rolesetOrRole) {
350 if (rolesetOrRole == null) {
351 return null;
352 }
353 final int start = Math.max(0, rolesetOrRole.indexOf('.') + 1);
354 int end = rolesetOrRole.indexOf('@');
355 end = end > 0 ? end : rolesetOrRole.length();
356 try {
357 return Integer.valueOf(rolesetOrRole.substring(start, end));
358 } catch (final Throwable ex) {
359 return null;
360 }
361 }
362
363 @Nullable
364 public static Integer extractArgNum(@Nullable final String role) {
365 if (role == null) {
366 return null;
367 }
368 int index = role.length();
369 while (index > 0 && Character.isDigit(role.charAt(index - 1))) {
370 --index;
371 }
372 return index == role.length() ? null : Integer.valueOf(role.substring(index));
373 }
374
375
376
/**
 * Returns the begin offset of a term, i.e. the value of {@code term.getOffset()}.
 *
 * @param term
 *            the term
 * @return the begin offset
 */
public static int getBegin(final Term term) {
    return term.getOffset();
}
380
381 public static int getEnd(final Term term) {
382 final List<WF> wfs = term.getWFs();
383 final WF wf = wfs.get(wfs.size() - 1);
384 final String str = wf.getForm();
385 if (str.equals("-LSB-") || str.equals("-RSB-") || str.equals("''")) {
386 return wf.getOffset() + 1;
387 }
388 return wf.getOffset() + wf.getLength();
389 }
390
/**
 * Returns the length of a term, as the difference between its end offset (see
 * {@code getEnd}) and its begin offset.
 *
 * @param term
 *            the term
 * @return the length in characters
 */
public static int getLength(final Term term) {
    return getEnd(term) - term.getOffset();
}
394
395 public static String getRoleset(final Predicate predicate) {
396 final String res = predicate.getTerms().get(0).getPos().equalsIgnoreCase("V") ? RESOURCE_PROPBANK
397 : RESOURCE_NOMBANK;
398 String roleset = null;
399 for (final ExternalRef ref : predicate.getExternalRefs()) {
400 if (res.equalsIgnoreCase(ref.getResource())) {
401 if (ref.getSource() != null) {
402 roleset = ref.getReference();
403 break;
404 } else if (roleset == null) {
405 roleset = ref.getReference();
406 }
407 }
408 }
409 return roleset;
410 }
411
412
413
414 @Nullable
415 public static ExternalRef getRef(@Nullable final Object annotation,
416 @Nullable final String resource, @Nullable final String reference) {
417 ExternalRef result = null;
418 for (final ExternalRef ref : getRefs(annotation)) {
419 if (matchRef(ref, resource, reference)) {
420 if (result != null) {
421 throw new IllegalStateException("Multiple ExternalRef matched for resource "
422 + resource + ", reference " + reference + ": " + ref.getReference()
423 + ", " + result.getReference());
424 }
425 result = ref;
426 }
427 }
428 return result;
429 }
430
431 public static List<ExternalRef> getRefs(final Object annotation,
432 @Nullable final String resource, @Nullable final String reference) {
433 final List<ExternalRef> result = Lists.newArrayList();
434 for (final ExternalRef ref : getRefs(annotation)) {
435 if (matchRef(ref, resource, reference)) {
436 result.add(ref);
437 }
438 }
439 return result;
440 }
441
442 public static void removeRefs(final Object annotation, @Nullable final String resource,
443 @Nullable final String reference) {
444 final List<ExternalRef> refs = getRefs(annotation);
445 for (final Iterator<ExternalRef> i = refs.iterator(); i.hasNext();) {
446 final ExternalRef ref = i.next();
447 if (matchRef(ref, resource, reference)) {
448 i.remove();
449 }
450 }
451 }
452
/**
 * Appends an external reference to the external reference list of an annotation, without
 * checking for duplicates.
 *
 * @param annotation
 *            the annotation object
 * @param ref
 *            the external reference to add
 * @throws IllegalArgumentException
 *             if the annotation is not of a supported type
 */
public static void addRef(final Object annotation, final ExternalRef ref) {
    getRefs(annotation).add(ref);
}
456
457 public static void setRef(final Object annotation, final ExternalRef ref) {
458 removeRefs(annotation, ref.getResource(), ref.getReference());
459 getRefs(annotation).add(ref);
460 }
461
462 public static String toString(final Object annotation) {
463 if (annotation instanceof Term) {
464 final Term term = (Term) annotation;
465 return "term " + term.getId() + " '" + term + "'";
466 } else if (annotation instanceof Entity) {
467 final Entity entity = (Entity) annotation;
468 return "entity " + entity.getId() + " '" + entity.getStr() + "'";
469 } else if (annotation instanceof Timex3) {
470 final Timex3 timex = (Timex3) annotation;
471 return "timex " + timex.getId() + " '" + timex.getSpan().getStr() + "'";
472 } else if (annotation instanceof Predicate) {
473 final Predicate pred = (Predicate) annotation;
474 return "predicate " + pred.getId() + " '" + pred.getSpan().getStr() + "'";
475 } else if (annotation instanceof Role) {
476 final Role role = (Role) annotation;
477 return "role " + role.getId() + " '" + role.getStr() + "' (" + role.getSemRole() + ")";
478 } else if (annotation instanceof Opinion) {
479 return "opinion " + ((Opinion) annotation).getId();
480 } else if (annotation instanceof OpinionTarget) {
481 return "opinion target '" + ((OpinionTarget) annotation).getSpan().getStr() + "'";
482 } else if (annotation instanceof OpinionHolder) {
483 return "opinion holder '" + ((OpinionHolder) annotation).getSpan().getStr() + "'";
484 } else if (annotation instanceof OpinionExpression) {
485 return "opinion expression '" + ((OpinionExpression) annotation).getSpan().getStr()
486 + "'";
487 } else if (annotation instanceof Factuality) {
488 final Factuality fact = (Factuality) annotation;
489 return "factuality " + fact.getId() + " '" + fact.getWord().getStr() + "'";
490 } else if (annotation instanceof Coref) {
491 return "coref " + ((Coref) annotation).getId();
492 } else {
493 throw new IllegalArgumentException("Unsupported annotation object: " + annotation);
494 }
495 }
496
497 private static List<ExternalRef> getRefs(final Object annotation) {
498 List<ExternalRef> refs = ImmutableList.of();
499 if (annotation instanceof Term) {
500 refs = ((Term) annotation).getExternalRefs();
501 } else if (annotation instanceof Entity) {
502 refs = ((Entity) annotation).getExternalRefs();
503 } else if (annotation instanceof Predicate) {
504 refs = ((Predicate) annotation).getExternalRefs();
505 } else if (annotation instanceof Role) {
506 refs = ((Role) annotation).getExternalRefs();
507 } else if (annotation instanceof Opinion) {
508 refs = ((Opinion) annotation).getExternalRefs();
509 } else if (annotation instanceof OpinionExpression) {
510 refs = ((OpinionExpression) annotation).getExternalRefs();
511 } else if (annotation instanceof OpinionTarget) {
512 refs = ((OpinionTarget) annotation).getExternalRefs();
513 } else if (annotation instanceof OpinionHolder) {
514 refs = ((OpinionHolder) annotation).getExternalRefs();
515 } else {
516 throw new IllegalArgumentException("Unsupported annotation object: " + annotation);
517 }
518 return refs;
519 }
520
521 private static boolean matchRef(final ExternalRef ref, @Nullable final String resource,
522 @Nullable final String reference) {
523 return (resource == null || resource.equalsIgnoreCase(ref.getResource()))
524 && (reference == null || reference.equals(ref.getReference()));
525 }
526
527 private static void normalizeRefs(final Collection<ExternalRef> refs) {
528 final Set<String> seen = Sets.newHashSet();
529 for (final Iterator<ExternalRef> i = refs.iterator(); i.hasNext();) {
530 final ExternalRef ref = i.next();
531 final String key = ref.getResource() + "|" + ref.getReference();
532 if (!seen.add(key)) {
533 i.remove();
534 }
535 }
536 }
537
538 public static List<Range> termRangesFor(final KAFDocument document, final Iterable<Term> terms) {
539 final List<Range> ranges = Lists.newArrayList();
540 int startIndex = -1;
541 int lastIndex = -2;
542 for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
543 final int termIndex = document.getTerms().indexOf(term);
544 if (termIndex - lastIndex > 1) {
545 if (startIndex >= 0) {
546 ranges.add(Range.create(startIndex, lastIndex + 1));
547 }
548 startIndex = termIndex;
549 }
550 lastIndex = termIndex;
551 }
552 if (startIndex != -1 && lastIndex >= startIndex) {
553 ranges.add(Range.create(startIndex, lastIndex + 1));
554 }
555 return ranges;
556 }
557
558 public static List<Range> rangesFor(final KAFDocument document, final Iterable<Term> terms) {
559 final List<Range> ranges = Lists.newArrayList();
560 int startOffset = -1;
561 int endOffset = -1;
562 int termIndex = -2;
563 for (final Term term : Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms)) {
564 final int lastTermIndex = termIndex;
565 termIndex = document.getTerms().indexOf(term);
566 if (termIndex - lastTermIndex > 1) {
567 if (startOffset != -1) {
568 ranges.add(Range.create(startOffset, endOffset));
569 }
570 startOffset = term.getOffset();
571 }
572 endOffset = NAFUtils.getEnd(term);
573 }
574 if (startOffset != -1 && endOffset > startOffset) {
575 ranges.add(Range.create(startOffset, endOffset));
576 }
577 return ranges;
578 }
579
/**
 * Returns the character range of a term, from its begin offset (see {@code getBegin}) to its
 * end offset (see {@code getEnd}).
 *
 * @param term
 *            the term
 * @return the range covered by the term
 */
public static Range rangeFor(final Term term) {
    return Range.create(NAFUtils.getBegin(term), NAFUtils.getEnd(term));
}
583
584 public static Range rangeFor(final Iterable<Term> terms) {
585 int begin = Integer.MAX_VALUE;
586 int end = Integer.MIN_VALUE;
587 for (final Term term : terms) {
588 begin = Math.min(begin, getBegin(term));
589 end = Math.max(end, getEnd(term));
590 }
591 return Range.create(begin, end);
592 }
593
594 @Nullable
595 public static Span<Term> trimSpan(@Nullable final Span<Term> span, final int sentenceID) {
596 if (span == null || span.isEmpty()) {
597 return null;
598 }
599 boolean sameSentence = true;
600 for (final Term term : span.getTargets()) {
601 if (term.getSent() != sentenceID) {
602 sameSentence = false;
603 break;
604 }
605 }
606 if (sameSentence) {
607 return span;
608 }
609 final List<Term> filteredTerms = Lists.newArrayList();
610 for (final Term term : span.getTargets()) {
611 if (term.getSent() == sentenceID) {
612 filteredTerms.add(term);
613 }
614 }
615 final Span<Term> result = KAFDocument.newTermSpan(filteredTerms);
616 for (final Term head : span.getHeads()) {
617 if (head.getSent() == sentenceID) {
618 result.getHeads().add(head);
619 }
620 }
621 return result;
622 }
623
624
625
/**
 * Normalizes a span so that it has a single head. If the span terms have one dependency root
 * the span is returned with that root as head. Otherwise two candidates are built: an
 * "external" span, obtained by adding the deepest common ancestor of all roots and the terms
 * connecting it to the roots, and an "internal" span, the largest group of span terms hanging
 * under a single root. The external candidate is chosen if it adds no more terms than the
 * internal candidate drops.
 *
 * @param document
 *            the document
 * @param span
 *            the span to normalize, possibly null
 * @return the normalized span; an empty span if the input is null or has no targets
 */
public static Span<Term> normalizeSpan(final KAFDocument document,
        @Nullable final Span<Term> span) {

    // Handle empty span
    if (span == null || Iterables.isEmpty(span.getTargets())) {
        return KAFDocument.newTermSpan();
    }

    // Identify the roots: span terms whose dependency parent is missing or outside the span
    final Set<Term> roots = Sets.newHashSet();
    final Set<Term> terms = ImmutableSet.copyOf(span.getTargets());
    for (final Term term : terms) {
        final Dep dep = document.getDepToTerm(term);
        if (dep == null || !terms.contains(dep.getFrom())) {
            roots.add(term);
        }
    }

    // A single root is the head of the span: nothing else to do
    if (roots.size() == 1) {
        return KAFDocument.newTermSpan(span.getTargets(), roots.iterator().next());
    }

    // For each root, compute the chain of its dependency ancestors followed by the root
    // itself (the chain is collected bottom-up and then reversed)
    final List<List<Term>> paths = Lists.newArrayList();
    for (final Term root : roots) {
        final List<Term> path = Lists.newArrayList(root);
        for (Dep dep = document.getDepToTerm(root); dep != null; dep = document
                .getDepToTerm(dep.getFrom())) {
            path.add(dep.getFrom());
        }
        Collections.reverse(path);
        paths.add(path);
    }

    // Scan the paths in parallel: externalHead is the last term shared by all of them, and
    // depth ends up as the first position where the paths differ (or the first path ends)
    int depth = 0;
    Term externalHead = null;
    outer: for (; depth < paths.get(0).size(); ++depth) {
        final Term t = paths.get(0).get(depth);
        for (int i = 1; i < paths.size(); ++i) {
            final List<Term> path = paths.get(i);
            if (depth >= path.size() || !path.get(depth).equals(t)) {
                break outer;
            }
        }
        externalHead = t;
    }

    // External candidate: span terms, plus the common ancestor, plus the portion of each path
    // below the common ancestor
    Set<Term> externalTerms = null;
    if (externalHead != null) {
        externalTerms = Sets.newHashSet(terms);
        externalTerms.add(externalHead);
        for (final List<Term> path : paths) {
            externalTerms.addAll(path.subList(depth, path.size()));
        }
    }

    // Group span terms by the root they hang from: climbing from each term, the first
    // dependency whose governor is outside the span identifies the root (dep.getTo()).
    // NOTE(review): a term whose ancestors are all in the span up to a term with no incoming
    // dependency is not added to any group - confirm this is intended
    final Multimap<Term, Term> map = HashMultimap.create();
    for (final Term term : terms) {
        Dep dep = document.getDepToTerm(term);
        if (dep == null) {
            map.put(term, term);
        } else {
            for (; dep != null; dep = document.getDepToTerm(dep.getFrom())) {
                if (!terms.contains(dep.getFrom())) {
                    map.put(dep.getTo(), term);
                    break;
                }
            }
        }
    }

    // Internal candidate: the largest group (on ties, the group visited last wins)
    Term internalHead = null;
    Collection<Term> internalTerms = null;
    for (final Map.Entry<Term, Collection<Term>> entry : map.asMap().entrySet()) {
        if (internalHead == null || entry.getValue().size() >= internalTerms.size()) {
            internalTerms = entry.getValue();
            internalHead = entry.getKey();
        }
    }

    // Prefer the external candidate if the number of terms it adds does not exceed the
    // number of terms the internal candidate drops
    if (externalTerms != null
            && externalTerms.size() - terms.size() <= terms.size() - internalTerms.size()) {
        return KAFDocument.newTermSpan(
                Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(externalTerms), externalHead);
    } else {
        return KAFDocument.newTermSpan(
                Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(internalTerms), internalHead);
    }
}
724
/**
 * Merges spans whose heads are linked through chains of "CONJ" / "COORD" dependencies.
 *
 * @param document
 *            the document
 * @param spans
 *            the spans to merge
 * @param canAddTerms
 *            if true, each merged span is expanded to all the document terms lying between
 *            its first and last term
 * @return the list of merged spans
 */
public static List<Span<Term>> mergeSpans(final KAFDocument document,
        final Iterable<Span<Term>> spans, final boolean canAddTerms) {

    // Index span extents by head and create a singleton cluster for each head; then link
    // heads connected by CONJ/COORD chains
    final Map<Term, List<Term>> extents = Maps.newHashMap();
    final Map<Term, Set<Term>> clusters = Maps.newHashMap();
    for (final Span<Term> span : spans) {
        final Term head = extractHead(document, span);
        clusters.put(head, Sets.newHashSet(head));
        extents.put(head, span.getTargets());
    }
    for (final Term head : clusters.keySet()) {
        // Climb governors as long as the dependency function is CONJ or COORD
        for (Dep dep = document.getDepToTerm(head); dep != null
                && ("CONJ".equals(dep.getRfunc()) || "COORD".equals(dep.getRfunc())); dep = document
                .getDepToTerm(dep.getFrom())) {
            if (clusters.keySet().contains(dep.getFrom())) {
                // The governor is another span head: record the link in both directions
                clusters.get(head).add(dep.getFrom());
                clusters.get(dep.getFrom()).add(head);
            } else if ("CO".indexOf(dep.getFrom().getPos()) < 0) {
                // Stop unless the governor POS is a substring of "CO" (presumably POS "C" or
                // "O" - TODO confirm the intended tags)
                break;
            }
        }
    }

    // Emit one span per cluster, consuming the clusters map
    final List<Span<Term>> result = Lists.newArrayList();
    while (!clusters.isEmpty()) {
        final Set<Term> heads = clusters.values().iterator().next();
        final Set<Term> terms = Sets.newHashSet();
        Term spanHead = heads.iterator().next();
        for (final Term head : heads) {
            clusters.remove(head);
            terms.addAll(extents.get(head));
            // Add the terms on the dependency path from this head up to any ancestor that is
            // also a head of the cluster; the last such ancestor found becomes the span head
            final List<Term> path = Lists.newArrayList();
            for (Dep dep = document.getDepToTerm(head); dep != null; dep = document
                    .getDepToTerm(dep.getFrom())) {
                final Term term = dep.getFrom();
                path.add(term);
                if (heads.contains(term)) {
                    terms.addAll(path);
                    path.clear();
                    spanHead = term;
                }
            }
        }
        List<Term> spanTerms = Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(terms);
        if (canAddTerms) {
            // Fill the gaps with every document term between the first and the last term
            final List<Term> docTerms = document.getTerms();
            spanTerms = Lists.newArrayList(docTerms.subList(
                    docTerms.indexOf(spanTerms.get(0)),
                    docTerms.indexOf(spanTerms.get(spanTerms.size() - 1)) + 1));
        }
        result.add(KAFDocument.newTermSpan(spanTerms, spanHead));
    }
    return result;
}
781
/**
 * Splits the supplied spans into one span per head, where heads are the heads of the input
 * spans plus the terms found while following "COORD" / "CONJ" dependencies from them.
 *
 * @param document
 *            the document
 * @param spans
 *            the spans to split
 * @return the list of resulting spans, each with its own head; spans with an empty extent
 *         are omitted
 */
public static final List<Span<Term>> splitSpans(final KAFDocument document,
        final Iterable<Span<Term>> spans) {

    // Collect the heads and the terms of all the spans
    final Set<Term> heads = Sets.newHashSet();
    final Set<Term> terms = Sets.newHashSet();
    for (final Span<Term> span : spans) {
        final Term head = extractHead(document, span);
        heads.add(head);
        terms.addAll(span.getTargets());

        // Breadth-first visit of the terms reachable from the head via COORD/CONJ links
        final List<Term> queue = Lists.newLinkedList();
        queue.add(head);
        while (!queue.isEmpty()) {
            final Term term = queue.remove(0);
            for (final Dep dep : document.getDepsFromTerm(term)) {
                final String func = dep.getRfunc();
                if ("COORD".equals(func) || "CONJ".equals(func)) {
                    final Term t = dep.getTo();
                    queue.add(t);
                    // The governing term becomes a head when the dependent is tagged "CC" or
                    // its tag does not start with a letter (presumably punctuation - TODO
                    // confirm)
                    if ("CC".equals(t.getMorphofeat())
                            || !Character.isLetter(t.getMorphofeat().charAt(0))) {
                        heads.add(term);
                    }
                }
            }
        }
    }

    // Build one span per head: the terms returned by getTermsByDepAncestors for the head,
    // minus those returned by getTermsByDepDescendants for all the heads, plus the head
    // itself, restricted to the terms of the input spans
    final Set<Term> excluded = document.getTermsByDepDescendants(heads);
    final List<Span<Term>> result = Lists.newArrayList();
    for (final Term head : heads) {
        final Set<Term> extent = document.getTermsByDepAncestors(ImmutableSet.of(head));
        extent.removeAll(excluded);
        extent.add(head);
        extent.retainAll(terms);
        if (!extent.isEmpty()) {
            result.add(KAFDocument.newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR)
                    .sortedCopy(extent), head));
        }
    }
    return result;
}
826
827 public static final List<Span<Term>> splitSpan(final KAFDocument document,
828 final Span<Term> span, final Iterable<Term> heads) {
829
830 final Set<Term> excludedTerms = document.getTermsByDepDescendants(heads);
831 final List<Span<Term>> spans = Lists.newArrayList();
832 for (final Term head : heads) {
833 final Set<Term> terms = document.getTermsByDepAncestors(ImmutableSet.of(head));
834 terms.removeAll(excludedTerms);
835 terms.add(head);
836 terms.retainAll(span.getTargets());
837 if (!terms.isEmpty()) {
838 spans.add(KAFDocument.newTermSpan(Ordering.from(Term.OFFSET_COMPARATOR)
839 .sortedCopy(terms), head));
840 }
841 }
842 return spans;
843 }
844
845
846
847 public static KAFDocument readDocument(@Nullable final Path path) throws IOException {
848 final KAFDocument document;
849 if (path == null) {
850 document = KAFDocument.createFromStream(IO.utf8Reader(IO.buffer(System.in)));
851 document.getPublic().publicId = "";
852 } else {
853 try (BufferedReader reader = Files.newBufferedReader(path)) {
854 document = KAFDocument.createFromStream(reader);
855 document.getPublic().publicId = path.toString();
856 }
857 }
858 return document;
859 }
860
861 public static void writeDocument(final KAFDocument document, @Nullable final Path location)
862 throws IOException {
863 if (location == null) {
864 System.out.write(document.toString().getBytes(Charsets.UTF_8));
865 } else {
866 try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(location.toString())))) {
867 writer.write(document.toString());
868 }
869 }
870 }
871
872
873 public static Term syntacticToSRLHead(final KAFDocument document, final Term term) {
874 for (final Dep dep : document.getDepsFromTerm(term)) {
875 final String func = dep.getRfunc();
876 if ("VC".equals(func) || "IM".equals(func)) {
877 return syntacticToSRLHead(document, dep.getTo());
878 }
879 }
880 return term;
881 }
882
883 public static Term srlToSyntacticHead(final KAFDocument document, final Term term) {
884 final Dep dep = document.getDepToTerm(term);
885 if (dep != null) {
886 final String func = dep.getRfunc();
887 if ("VC".equals(func) || "IM".equals(func)) {
888 return srlToSyntacticHead(document, dep.getFrom());
889 }
890 }
891 return term;
892 }
893
894
895
896 public static String extendedPos(final KAFDocument document, final Term term) {
897 final String pos = term.getMorphofeat();
898 final String lemma = term.getLemma().toLowerCase();
899 if ("some".equals(lemma) || "many".equals(lemma) || "all".equals(lemma)
900 || "few".equals(lemma) || "this".equals(lemma) || "these".equals(lemma)
901 || "that".equals(lemma) || "those".equals(lemma)) {
902 final Dep dep = document.getDepToTerm(term);
903 if (dep == null || !"NMOD".equals(dep.getRfunc())) {
904 return pos + "P";
905 }
906 }
907 return pos;
908 }
909
910 public static Boolean isActiveForm(final KAFDocument document, final Term term) {
911 final String word = term.getStr().toLowerCase();
912 final String pos = term.getMorphofeat();
913 if (!pos.startsWith("V")) {
914 return null;
915 }
916 if (word.equals("been") || !pos.equals("VBN")) {
917 return Boolean.TRUE;
918 }
919 return isActiveFormHelper(document, term);
920 }
921
922 private static Boolean isActiveFormHelper(final KAFDocument document, final Term term) {
923 final Dep dep = document.getDepToTerm(term);
924 if (dep == null) {
925 return Boolean.FALSE;
926 }
927 final Term parent = dep.getFrom();
928 final String word = parent.getStr().toLowerCase();
929 final String pos = parent.getMorphofeat();
930 if (pos.startsWith("NN")) {
931 return Boolean.FALSE;
932 }
933 if (word.matches("am|are|is|was|were|be|been|being")) {
934 return Boolean.FALSE;
935 }
936 if (word.matches("ha(ve|s|d|ving)")) {
937 return Boolean.TRUE;
938 }
939
940 if (pos.matches("VBZ|VBD|VBP|MD")) {
941 return Boolean.FALSE;
942 }
943 return isActiveFormHelper(document, parent);
944 }
945
946 public static java.util.function.Predicate<Term> matchExtendedPos(final KAFDocument document,
947 final String... posPrefixes) {
948 return new java.util.function.Predicate<Term>() {
949
950 @Override
951 public boolean test(final Term term) {
952 final String pos = extendedPos(document, term);
953 for (final String prefix : posPrefixes) {
954 if (pos.startsWith(prefix)) {
955 return true;
956 }
957 }
958 return false;
959 }
960
961 };
962 }
963
964
965 public static Set<Term> getTermsByDepAncestor(final KAFDocument document, final Term head,
966 final boolean consecutive) {
967 final Set<Term> descendants = document.getTermsByDepAncestors(ImmutableSet.of(head));
968 if (consecutive) {
969 final List<Term> sortedTerms = Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(
970 descendants);
971 final int[] indexes = new int[sortedTerms.size()];
972 for (int i = 0; i < sortedTerms.size(); ++i) {
973 indexes[i] = document.getTerms().indexOf(sortedTerms.get(i));
974 }
975 final int h = sortedTerms.indexOf(head);
976 boolean filtered = false;
977 for (int i = h + 1; i < indexes.length; ++i) {
978 filtered |= indexes[i] > indexes[i - 1] + 1;
979 if (filtered) {
980 descendants.remove(sortedTerms.get(i));
981 }
982 }
983 filtered = false;
984 for (int i = h - 1; i >= 0; --i) {
985 filtered |= indexes[i] < indexes[i + 1] - 1;
986 if (filtered) {
987 descendants.remove(sortedTerms.get(i));
988 }
989 }
990 }
991 return descendants;
992 }
993
994
995 public static IRI createPreMOnSemanticClassIRIfor(String model, String predicate){
996
997 String prefix = "";
998 switch (model) {
999
1000 case RESOURCE_FRAMENET : prefix+=PREMON_FNPREFIX+"-"; break;
1001 case RESOURCE_VERBNET : prefix+=PREMON_VNPREFIX+"-"; break;
1002 case RESOURCE_PROPBANK : prefix+=PREMON_PBPREFIX+"-"; break;
1003 case RESOURCE_NOMBANK : prefix+=PREMON_NBPREFIX+"-"; break;
1004
1005 }
1006
1007
1008 String localname=prefix+predicate.toLowerCase();
1009
1010 return Statements.VALUE_FACTORY.createIRI(PREMON_NAMESPACE, localname);
1011
1012 }
1013
1014
1015 public static IRI createPreMOnSemanticRoleIRIfor(String model, String predicate, String role){
1016
1017 String prefix = "";
1018
1019
1020
1021
1022
1023 switch (model) {
1024 case RESOURCE_FRAMENET : prefix+=PREMON_FNPREFIX+"-";
1025 role=role.toLowerCase();
1026 break;
1027 case RESOURCE_VERBNET : prefix+=PREMON_VNPREFIX+"-";
1028 role=role.toLowerCase();
1029 break;
1030 case RESOURCE_PROPBANK : prefix+=PREMON_PBPREFIX+"-";
1031 role=role.toLowerCase();
1032 if (!role.contains("am-")) role=role.replace("a","arg");
1033 else role=role.replace("am-","arg");
1034 break;
1035 case RESOURCE_NOMBANK : prefix+=PREMON_NBPREFIX+"-";
1036 role=role.toLowerCase();
1037 if (!role.contains("am-")) role=role.replace("a","arg");
1038 else role=role.replace("am-","arg");
1039 break;
1040 }
1041
1042 String localname=prefix+predicate.toLowerCase()+PREMON_ARGUMENT_SEPARATOR+role;
1043
1044 return Statements.VALUE_FACTORY.createIRI(PREMON_NAMESPACE, localname);
1045
1046 }
1047
1048 }