1   package eu.fbk.dkm.pikes.raid.pipeline;
2   
3   import java.io.IOException;
4   import java.nio.file.Files;
5   import java.nio.file.Path;
6   import java.util.EnumSet;
7   import java.util.Iterator;
8   import java.util.List;
9   import java.util.Map;
10  import java.util.Properties;
11  import java.util.Set;
12  
13  import javax.annotation.Nullable;
14  
15  import com.google.common.base.Preconditions;
16  import com.google.common.collect.ImmutableList;
17  import com.google.common.collect.ImmutableSet;
18  import com.google.common.collect.Iterables;
19  import com.google.common.collect.Lists;
20  import com.google.common.collect.Maps;
21  import com.google.common.collect.Ordering;
22  import com.google.common.collect.Sets;
23  
24  import ixa.kaflib.Dep;
25  import ixa.kaflib.KAFDocument;
26  import ixa.kaflib.Opinion;
27  import ixa.kaflib.Opinion.Polarity;
28  import ixa.kaflib.Span;
29  import ixa.kaflib.Term;
30  
31  import eu.fbk.dkm.pikes.raid.Component;
32  import eu.fbk.dkm.pikes.raid.Extractor;
33  import eu.fbk.dkm.pikes.raid.Opinions;
34  import eu.fbk.dkm.pikes.resources.NAFFilter;
35  import eu.fbk.dkm.pikes.resources.NAFUtils;
36  
37  public final class PipelineExtractor extends Extractor {
38  
39      @Nullable
40      private final LinkLabeller holderLinkLabeller;
41  
42      @Nullable
43      private final LinkLabeller targetLinkLabeller;
44  
45      @Nullable
46      private final SpanLabeller holderSpanLabeller;
47  
48      @Nullable
49      private final SpanLabeller targetSpanLabeller;
50  
51      private final boolean holderUnique;
52  
53      private final boolean targetUnique;
54  
55      private final NAFFilter filter;
56  
57      protected PipelineExtractor(final Properties properties, final Path path) throws IOException {
58          this(Files.exists(path.resolve("holder-link")) ? LinkLabeller.readFrom(path
59                  .resolve("holder-link")) : null,
60                  Files.exists(path.resolve("target-link")) ? LinkLabeller.readFrom(path
61                          .resolve("target-link")) : null,
62                  Files.exists(path.resolve("holder-span")) ? SpanLabeller.readFrom(path
63                          .resolve("holder-span")) : null,
64                  Files.exists(path.resolve("target-span")) ? SpanLabeller.readFrom(path
65                          .resolve("target-span")) : null, //
66                  Boolean.parseBoolean(properties.getProperty("holder.unique", "false")), //
67                  Boolean.parseBoolean(properties.getProperty("target.unique", "false")));
68      }
69  
70      protected PipelineExtractor(@Nullable final LinkLabeller holderLinkLabeller,
71              @Nullable final LinkLabeller targetLinkLabeller,
72              @Nullable final SpanLabeller holderSpanLabeller,
73              @Nullable final SpanLabeller targetSpanLabeller, final boolean holderUnique,
74              final boolean targetUnique) {
75  
76          Preconditions.checkArgument(holderLinkLabeller == null == (holderSpanLabeller == null));
77          Preconditions.checkArgument(targetLinkLabeller == null == (targetSpanLabeller == null));
78  
79          this.holderLinkLabeller = holderLinkLabeller;
80          this.targetLinkLabeller = targetLinkLabeller;
81          this.holderSpanLabeller = holderSpanLabeller;
82          this.targetSpanLabeller = targetSpanLabeller;
83          this.holderUnique = holderUnique;
84          this.targetUnique = targetUnique;
85  
86          this.filter = NAFFilter.builder(false).withTermSenseCompletion(true)
87                  .withEntityAddition(true).withEntityRemoveOverlaps(true)
88                  .withEntitySpanFixing(true).withSRLPredicateAddition(true)
89                  .withSRLRemoveWrongRefs(true).withSRLSelfArgFixing(true).build();
90      }
91  
92      @Override
93      protected void doFilter(final KAFDocument document) {
94          this.filter.accept(document);
95      }
96  
97      @SuppressWarnings("unchecked")
98      @Override
99      protected Iterable<Opinion> doExtract(final KAFDocument document, final int sentence,
100             final EnumSet<Component> components) {
101 
102         // Extract expressions and, for each of them, their holders and targets
103         final List<Opinion> opinions = Lists.newArrayList();
104         for (final Span<Term> expressionSpan : findExpressions(document, sentence)) {
105 
106             // Identify the expression head
107             final Term expressionHead = Ordering.from(Term.OFFSET_COMPARATOR).max(
108                     Opinions.heads(document, NAFUtils.normalizeSpan(document, expressionSpan),
109                             Component.EXPRESSION));
110 
111             // Find the polarity, if enabled
112             Polarity polarity = null;
113             if (components.contains(Component.POLARITY)) {
114                 polarity = findPolarity(expressionSpan);
115             }
116 
117             // Find holders, if enabled
118             final List<Span<Term>> holderSpans = Lists.newArrayList((Span<Term>) null);
119             if (components.contains(Component.HOLDER) && this.holderLinkLabeller != null) {
120                 Iterables.addAll(holderSpans, findArguments(document, sentence, expressionHead, //
121                         this.holderLinkLabeller, this.holderSpanLabeller, this.holderUnique));
122             }
123 
124             // Find targets, if enabled
125             final List<Span<Term>> targetSpans = Lists.newArrayList((Span<Term>) null);
126             if (components.contains(Component.TARGET) && this.targetLinkLabeller != null) {
127                 findArguments(document, sentence, expressionHead, this.targetLinkLabeller,
128                         this.targetSpanLabeller, this.targetUnique);
129             }
130 
131             // Emit opinions
132             opinions.addAll(Opinions.create(document, expressionSpan, holderSpans, targetSpans,
133                     polarity));
134         }
135         return opinions;
136     }
137 
138     @SuppressWarnings({ "unchecked" })
139     @Override
140     protected Iterable<Opinion> doRefine(final KAFDocument document, final int sentence,
141             final EnumSet<Component> components, final Opinion opinion) {
142 
143         // Find the polarity
144         Polarity polarity = Polarity.forOpinion(opinion);
145         if (components.contains(Component.POLARITY)) {
146             polarity = findPolarity(opinion.getExpressionSpan());
147         }
148 
149         // Retrieve the expression head
150         final List<Span<Term>> holderSpans = Lists.newArrayList((Span<Term>) null);
151         final List<Span<Term>> targetSpans = Lists.newArrayList((Span<Term>) null);
152         final Set<Term> expressionHeads = Opinions.heads(document,
153                 NAFUtils.normalizeSpan(document, opinion.getExpressionSpan()),
154                 Component.EXPRESSION);
155 
156         // Retrieve holders and targets only if head is defined
157         if (!expressionHeads.isEmpty()) {
158 
159             // Take one head
160             final Term expressionHead = Ordering.from(Term.OFFSET_COMPARATOR).max(expressionHeads);
161 
162             // Find holders
163             if (components.contains(Component.HOLDER) && this.holderLinkLabeller != null) {
164                 Iterables.addAll(holderSpans, findArguments(document, sentence, expressionHead, //
165                         this.holderLinkLabeller, this.holderSpanLabeller, this.holderUnique));
166             } else if (opinion.getHolderSpan() != null) {
167                 holderSpans.add(opinion.getHolderSpan());
168             }
169 
170             // Find targets
171             if (components.contains(Component.TARGET) && this.targetLinkLabeller != null) {
172                 Iterables.addAll(targetSpans, findArguments(document, sentence, expressionHead, //
173                         this.targetLinkLabeller, this.targetSpanLabeller, this.targetUnique));
174             } else if (opinion.getTargetSpan() != null) {
175                 targetSpans.add(opinion.getTargetSpan());
176             }
177         }
178 
179         // Emit opinions
180         return Opinions.create(document, opinion.getExpressionSpan(), holderSpans, targetSpans,
181                 polarity);
182     }
183 
184     @Override
185     protected void doWrite(final Properties properties, final Path path) throws IOException {
186 
187         // TODO: Alessio
188 
189         if (this.holderLinkLabeller != null) {
190             this.holderLinkLabeller.writeTo(path.resolve("holder-link"));
191             this.holderSpanLabeller.writeTo(path.resolve("holder-span"));
192         }
193         if (this.targetLinkLabeller != null) {
194             this.targetLinkLabeller.writeTo(path.resolve("target-link"));
195             this.targetSpanLabeller.writeTo(path.resolve("target-span"));
196         }
197 
198         properties.setProperty("holder.unique", Boolean.toString(this.holderUnique));
199         properties.setProperty("target.unique", Boolean.toString(this.targetUnique));
200     }
201 
202     private Iterable<Span<Term>> findExpressions(final KAFDocument document, final int sentence) {
203         // TODO: Alessio
204         return ImmutableList.of();
205     }
206 
207     private List<Span<Term>> findArguments(final KAFDocument document, final int sentence,
208             final Term expressionHead, final LinkLabeller linkLabeller,
209             final SpanLabeller spanLabeller, final boolean unique) {
210 
211         final Map<Term, Float> map = linkLabeller.label(document, expressionHead);
212 
213         final Set<Term> blockedTerms = document.getTermsByDepDescendants(ImmutableSet
214                 .of(expressionHead));
215         final Map<Term, Set<Term>> clusters = Maps.newHashMap();
216         for (final Term term : document.getTermsBySent(expressionHead.getSent())) {
217             clusters.put(term, ImmutableSet.of(term));
218         }
219         List<Dep> deps = document.getDepsBySent(expressionHead.getSent());
220         deps = deps != null ? deps : Lists.newArrayList();
221         for (final Dep dep : deps) {
222             if ("COORD".equals(dep.getRfunc()) || "CONJ".equals(dep.getRfunc())) {
223                 if (blockedTerms.contains(dep.getFrom()) || blockedTerms.contains(dep.getTo())) {
224                     continue;
225                 }
226                 final Set<Term> fromCluster = clusters.get(dep.getFrom());
227                 final Set<Term> toCluster = clusters.get(dep.getTo());
228                 final Set<Term> mergedCluster = ImmutableSet.copyOf(Sets.union(fromCluster,
229                         toCluster));
230                 for (final Term term : mergedCluster) {
231                     clusters.put(term, mergedCluster);
232                 }
233             }
234         }
235         Float bestScore = Float.MIN_VALUE;
236         Set<Term> bestCluster = null;
237         while (!clusters.isEmpty()) {
238             final Set<Term> cluster = clusters.values().iterator().next();
239             clusters.keySet().removeAll(cluster);
240             float score = 1.0f; // was 0;
241             int count = 0;
242             for (final Term term : cluster) {
243                 final Float s = map.get(term);
244                 if (s != null) {
245                     ++count;
246                     score = Math.min(score, s); // was max
247                 }
248             }
249             if (count > 0) {
250                 for (final Term term : cluster) {
251                     if ("CO".indexOf(term.getPos().charAt(0)) < 0) {
252                         map.put(term, score);
253                     }
254                 }
255                 if (bestCluster == null || score >= bestScore) {
256                     bestScore = score;
257                     bestCluster = cluster;
258                 }
259             }
260         }
261         for (final Iterator<Map.Entry<Term, Float>> i = map.entrySet().iterator(); i.hasNext();) {
262             if (i.next().getValue() < 0.5f) {
263                 i.remove();
264             }
265         }
266         if (bestCluster != null) {
267             map.keySet().retainAll(bestCluster);
268         }
269 
270         final List<Span<Term>> argSpans = Lists.newArrayList();
271         for (final Term argHead : map.keySet()) {
272             final List<Term> excludedTerms = Lists.newArrayList(map.keySet());
273             excludedTerms.remove(argHead);
274             argSpans.add(spanLabeller.expand(document, argHead, excludedTerms));
275         }
276 
277         // If a unique span is required, we add missing terms between found spans so to get a span
278         // of consecutive terms
279         List<Span<Term>> spans = argSpans;
280         if (unique && spans.size() > 1) {
281             spans = NAFUtils.mergeSpans(document, spans, unique);
282             if (spans.size() > 1) {
283                 final Set<Term> terms = Sets.newHashSet();
284                 for (final Span<Term> span : spans) {
285                     terms.addAll(span.getTargets());
286                 }
287                 spans = ImmutableList.of(KAFDocument.newTermSpan(Ordering.from(
288                         Term.OFFSET_COMPARATOR).sortedCopy(terms)));
289             }
290         }
291         return spans;
292     }
293 
294     private Polarity findPolarity(final Span<Term> span) {
295         // TODO: Mauro
296         return null;
297     }
298 
299 }