1 package eu.fbk.dkm.pikes.raid.pipeline;
2
3 import java.io.IOException;
4 import java.nio.file.Files;
5 import java.nio.file.Path;
6 import java.util.EnumSet;
7 import java.util.Iterator;
8 import java.util.List;
9 import java.util.Map;
10 import java.util.Properties;
11 import java.util.Set;
12
13 import javax.annotation.Nullable;
14
15 import com.google.common.base.Preconditions;
16 import com.google.common.collect.ImmutableList;
17 import com.google.common.collect.ImmutableSet;
18 import com.google.common.collect.Iterables;
19 import com.google.common.collect.Lists;
20 import com.google.common.collect.Maps;
21 import com.google.common.collect.Ordering;
22 import com.google.common.collect.Sets;
23
24 import ixa.kaflib.Dep;
25 import ixa.kaflib.KAFDocument;
26 import ixa.kaflib.Opinion;
27 import ixa.kaflib.Opinion.Polarity;
28 import ixa.kaflib.Span;
29 import ixa.kaflib.Term;
30
31 import eu.fbk.dkm.pikes.raid.Component;
32 import eu.fbk.dkm.pikes.raid.Extractor;
33 import eu.fbk.dkm.pikes.raid.Opinions;
34 import eu.fbk.dkm.pikes.resources.NAFFilter;
35 import eu.fbk.dkm.pikes.resources.NAFUtils;
36
37 public final class PipelineExtractor extends Extractor {
38
39 @Nullable
40 private final LinkLabeller holderLinkLabeller;
41
42 @Nullable
43 private final LinkLabeller targetLinkLabeller;
44
45 @Nullable
46 private final SpanLabeller holderSpanLabeller;
47
48 @Nullable
49 private final SpanLabeller targetSpanLabeller;
50
51 private final boolean holderUnique;
52
53 private final boolean targetUnique;
54
55 private final NAFFilter filter;
56
57 protected PipelineExtractor(final Properties properties, final Path path) throws IOException {
58 this(Files.exists(path.resolve("holder-link")) ? LinkLabeller.readFrom(path
59 .resolve("holder-link")) : null,
60 Files.exists(path.resolve("target-link")) ? LinkLabeller.readFrom(path
61 .resolve("target-link")) : null,
62 Files.exists(path.resolve("holder-span")) ? SpanLabeller.readFrom(path
63 .resolve("holder-span")) : null,
64 Files.exists(path.resolve("target-span")) ? SpanLabeller.readFrom(path
65 .resolve("target-span")) : null,
66 Boolean.parseBoolean(properties.getProperty("holder.unique", "false")),
67 Boolean.parseBoolean(properties.getProperty("target.unique", "false")));
68 }
69
70 protected PipelineExtractor(@Nullable final LinkLabeller holderLinkLabeller,
71 @Nullable final LinkLabeller targetLinkLabeller,
72 @Nullable final SpanLabeller holderSpanLabeller,
73 @Nullable final SpanLabeller targetSpanLabeller, final boolean holderUnique,
74 final boolean targetUnique) {
75
76 Preconditions.checkArgument(holderLinkLabeller == null == (holderSpanLabeller == null));
77 Preconditions.checkArgument(targetLinkLabeller == null == (targetSpanLabeller == null));
78
79 this.holderLinkLabeller = holderLinkLabeller;
80 this.targetLinkLabeller = targetLinkLabeller;
81 this.holderSpanLabeller = holderSpanLabeller;
82 this.targetSpanLabeller = targetSpanLabeller;
83 this.holderUnique = holderUnique;
84 this.targetUnique = targetUnique;
85
86 this.filter = NAFFilter.builder(false).withTermSenseCompletion(true)
87 .withEntityAddition(true).withEntityRemoveOverlaps(true)
88 .withEntitySpanFixing(true).withSRLPredicateAddition(true)
89 .withSRLRemoveWrongRefs(true).withSRLSelfArgFixing(true).build();
90 }
91
92 @Override
93 protected void doFilter(final KAFDocument document) {
94 this.filter.accept(document);
95 }
96
97 @SuppressWarnings("unchecked")
98 @Override
99 protected Iterable<Opinion> doExtract(final KAFDocument document, final int sentence,
100 final EnumSet<Component> components) {
101
102
103 final List<Opinion> opinions = Lists.newArrayList();
104 for (final Span<Term> expressionSpan : findExpressions(document, sentence)) {
105
106
107 final Term expressionHead = Ordering.from(Term.OFFSET_COMPARATOR).max(
108 Opinions.heads(document, NAFUtils.normalizeSpan(document, expressionSpan),
109 Component.EXPRESSION));
110
111
112 Polarity polarity = null;
113 if (components.contains(Component.POLARITY)) {
114 polarity = findPolarity(expressionSpan);
115 }
116
117
118 final List<Span<Term>> holderSpans = Lists.newArrayList((Span<Term>) null);
119 if (components.contains(Component.HOLDER) && this.holderLinkLabeller != null) {
120 Iterables.addAll(holderSpans, findArguments(document, sentence, expressionHead,
121 this.holderLinkLabeller, this.holderSpanLabeller, this.holderUnique));
122 }
123
124
125 final List<Span<Term>> targetSpans = Lists.newArrayList((Span<Term>) null);
126 if (components.contains(Component.TARGET) && this.targetLinkLabeller != null) {
127 findArguments(document, sentence, expressionHead, this.targetLinkLabeller,
128 this.targetSpanLabeller, this.targetUnique);
129 }
130
131
132 opinions.addAll(Opinions.create(document, expressionSpan, holderSpans, targetSpans,
133 polarity));
134 }
135 return opinions;
136 }
137
138 @SuppressWarnings({ "unchecked" })
139 @Override
140 protected Iterable<Opinion> doRefine(final KAFDocument document, final int sentence,
141 final EnumSet<Component> components, final Opinion opinion) {
142
143
144 Polarity polarity = Polarity.forOpinion(opinion);
145 if (components.contains(Component.POLARITY)) {
146 polarity = findPolarity(opinion.getExpressionSpan());
147 }
148
149
150 final List<Span<Term>> holderSpans = Lists.newArrayList((Span<Term>) null);
151 final List<Span<Term>> targetSpans = Lists.newArrayList((Span<Term>) null);
152 final Set<Term> expressionHeads = Opinions.heads(document,
153 NAFUtils.normalizeSpan(document, opinion.getExpressionSpan()),
154 Component.EXPRESSION);
155
156
157 if (!expressionHeads.isEmpty()) {
158
159
160 final Term expressionHead = Ordering.from(Term.OFFSET_COMPARATOR).max(expressionHeads);
161
162
163 if (components.contains(Component.HOLDER) && this.holderLinkLabeller != null) {
164 Iterables.addAll(holderSpans, findArguments(document, sentence, expressionHead,
165 this.holderLinkLabeller, this.holderSpanLabeller, this.holderUnique));
166 } else if (opinion.getHolderSpan() != null) {
167 holderSpans.add(opinion.getHolderSpan());
168 }
169
170
171 if (components.contains(Component.TARGET) && this.targetLinkLabeller != null) {
172 Iterables.addAll(targetSpans, findArguments(document, sentence, expressionHead,
173 this.targetLinkLabeller, this.targetSpanLabeller, this.targetUnique));
174 } else if (opinion.getTargetSpan() != null) {
175 targetSpans.add(opinion.getTargetSpan());
176 }
177 }
178
179
180 return Opinions.create(document, opinion.getExpressionSpan(), holderSpans, targetSpans,
181 polarity);
182 }
183
184 @Override
185 protected void doWrite(final Properties properties, final Path path) throws IOException {
186
187
188
189 if (this.holderLinkLabeller != null) {
190 this.holderLinkLabeller.writeTo(path.resolve("holder-link"));
191 this.holderSpanLabeller.writeTo(path.resolve("holder-span"));
192 }
193 if (this.targetLinkLabeller != null) {
194 this.targetLinkLabeller.writeTo(path.resolve("target-link"));
195 this.targetSpanLabeller.writeTo(path.resolve("target-span"));
196 }
197
198 properties.setProperty("holder.unique", Boolean.toString(this.holderUnique));
199 properties.setProperty("target.unique", Boolean.toString(this.targetUnique));
200 }
201
202 private Iterable<Span<Term>> findExpressions(final KAFDocument document, final int sentence) {
203
204 return ImmutableList.of();
205 }
206
207 private List<Span<Term>> findArguments(final KAFDocument document, final int sentence,
208 final Term expressionHead, final LinkLabeller linkLabeller,
209 final SpanLabeller spanLabeller, final boolean unique) {
210
211 final Map<Term, Float> map = linkLabeller.label(document, expressionHead);
212
213 final Set<Term> blockedTerms = document.getTermsByDepDescendants(ImmutableSet
214 .of(expressionHead));
215 final Map<Term, Set<Term>> clusters = Maps.newHashMap();
216 for (final Term term : document.getTermsBySent(expressionHead.getSent())) {
217 clusters.put(term, ImmutableSet.of(term));
218 }
219 List<Dep> deps = document.getDepsBySent(expressionHead.getSent());
220 deps = deps != null ? deps : Lists.newArrayList();
221 for (final Dep dep : deps) {
222 if ("COORD".equals(dep.getRfunc()) || "CONJ".equals(dep.getRfunc())) {
223 if (blockedTerms.contains(dep.getFrom()) || blockedTerms.contains(dep.getTo())) {
224 continue;
225 }
226 final Set<Term> fromCluster = clusters.get(dep.getFrom());
227 final Set<Term> toCluster = clusters.get(dep.getTo());
228 final Set<Term> mergedCluster = ImmutableSet.copyOf(Sets.union(fromCluster,
229 toCluster));
230 for (final Term term : mergedCluster) {
231 clusters.put(term, mergedCluster);
232 }
233 }
234 }
235 Float bestScore = Float.MIN_VALUE;
236 Set<Term> bestCluster = null;
237 while (!clusters.isEmpty()) {
238 final Set<Term> cluster = clusters.values().iterator().next();
239 clusters.keySet().removeAll(cluster);
240 float score = 1.0f;
241 int count = 0;
242 for (final Term term : cluster) {
243 final Float s = map.get(term);
244 if (s != null) {
245 ++count;
246 score = Math.min(score, s);
247 }
248 }
249 if (count > 0) {
250 for (final Term term : cluster) {
251 if ("CO".indexOf(term.getPos().charAt(0)) < 0) {
252 map.put(term, score);
253 }
254 }
255 if (bestCluster == null || score >= bestScore) {
256 bestScore = score;
257 bestCluster = cluster;
258 }
259 }
260 }
261 for (final Iterator<Map.Entry<Term, Float>> i = map.entrySet().iterator(); i.hasNext();) {
262 if (i.next().getValue() < 0.5f) {
263 i.remove();
264 }
265 }
266 if (bestCluster != null) {
267 map.keySet().retainAll(bestCluster);
268 }
269
270 final List<Span<Term>> argSpans = Lists.newArrayList();
271 for (final Term argHead : map.keySet()) {
272 final List<Term> excludedTerms = Lists.newArrayList(map.keySet());
273 excludedTerms.remove(argHead);
274 argSpans.add(spanLabeller.expand(document, argHead, excludedTerms));
275 }
276
277
278
279 List<Span<Term>> spans = argSpans;
280 if (unique && spans.size() > 1) {
281 spans = NAFUtils.mergeSpans(document, spans, unique);
282 if (spans.size() > 1) {
283 final Set<Term> terms = Sets.newHashSet();
284 for (final Span<Term> span : spans) {
285 terms.addAll(span.getTargets());
286 }
287 spans = ImmutableList.of(KAFDocument.newTermSpan(Ordering.from(
288 Term.OFFSET_COMPARATOR).sortedCopy(terms)));
289 }
290 }
291 return spans;
292 }
293
294 private Polarity findPolarity(final Span<Term> span) {
295
296 return null;
297 }
298
299 }