1 package eu.fbk.dkm.pikes.raid;
2
3 import com.github.mustachejava.DefaultMustacheFactory;
4 import com.github.mustachejava.Mustache;
5 import com.google.common.base.Charsets;
6 import com.google.common.base.Preconditions;
7 import com.google.common.base.Splitter;
8 import com.google.common.base.Throwables;
9 import com.google.common.collect.*;
10 import com.google.common.io.Files;
11 import com.google.common.io.Resources;
12 import eu.fbk.dkm.pikes.naflib.Corpus;
13 import eu.fbk.dkm.pikes.naflib.OpinionPrecisionRecall;
14 import eu.fbk.dkm.pikes.rdf.RDFGenerator;
15 import eu.fbk.dkm.pikes.rdf.Renderer;
16 import eu.fbk.dkm.pikes.resources.NAFUtils;
17 import eu.fbk.dkm.pikes.resources.WordNet;
18 import eu.fbk.rdfpro.util.Statements;
19 import eu.fbk.utils.core.CommandLine;
20 import eu.fbk.utils.core.CommandLine.Type;
21 import eu.fbk.utils.core.Range;
22 import eu.fbk.utils.svm.Util;
23 import eu.fbk.dkm.pikes.rdf.vocab.KS_OLD;
24 import eu.fbk.rdfpro.util.IO;
25 import eu.fbk.rdfpro.util.QuadModel;
26 import eu.fbk.rdfpro.util.Tracker;
27 import ixa.kaflib.KAFDocument;
28 import ixa.kaflib.Opinion;
29 import ixa.kaflib.Opinion.Polarity;
30 import ixa.kaflib.Term;
31 import org.eclipse.rdf4j.model.Model;
32 import org.eclipse.rdf4j.model.IRI;
33 import org.slf4j.Logger;
34 import org.slf4j.LoggerFactory;
35
36 import javax.annotation.Nullable;
37 import java.io.File;
38 import java.io.IOException;
39 import java.io.InputStreamReader;
40 import java.io.Writer;
41 import java.nio.file.Path;
42 import java.util.*;
43 import java.util.concurrent.locks.ReadWriteLock;
44 import java.util.concurrent.locks.ReentrantReadWriteLock;
45 import java.util.stream.StreamSupport;
46
47 public final class Analyzer {
48
49 private static final Logger LOGGER = LoggerFactory.getLogger(Analyzer.class);
50
51 private static final Mustache INDEX_TEMPLATE = loadTemplate(Analyzer.class.getSimpleName()
52 + ".index.html");
53
54 private static final Mustache SENTENCE_TEMPLATE = loadTemplate(Analyzer.class.getSimpleName()
55 + ".sentence.html");
56
57 private final Set<String> goldLabels;
58
59 @Nullable
60 private final Set<String> testLabels;
61
62 @Nullable
63 private final Extractor extractor;
64
65 @Nullable
66 private final Path reportPath;
67
68 @Nullable
69 private final List<Map<String, Object>> reportModel;
70
71 @Nullable
72 private final Renderer reportRenderer;
73
74 private final Component[] components;
75
76 private final OpinionPrecisionRecall.Evaluator evaluator;
77
78 private final ReadWriteLock lock;
79
80 private Analyzer(final Iterable<String> goldLabels,
81 @Nullable final Iterable<String> testLabels, @Nullable final Extractor extractor,
82 @Nullable final Path reportPath, final Component... components) {
83
84 Preconditions.checkNotNull(goldLabels);
85 Preconditions.checkArgument(Iterables.size(goldLabels) > 0);
86 Preconditions.checkArgument(extractor != null || testLabels != null);
87 Preconditions.checkArgument(extractor == null || testLabels == null);
88 Preconditions.checkArgument(testLabels == null || Iterables.size(testLabels) > 0);
89
90 Renderer renderer = null;
91 if (reportPath != null) {
92 final List<IRI> nodeTypes = ImmutableList.<IRI>builder()
93 .addAll(Renderer.DEFAULT_NODE_TYPES).add(KS_OLD.ATTRIBUTE).build();
94 final Map<Object, String> colorMap = ImmutableMap.<Object, String>builder()
95 .putAll(Renderer.DEFAULT_COLOR_MAP).build();
96 final Map<Object, String> styleMap = ImmutableMap
97 .<Object, String>builder()
98 .putAll(Renderer.DEFAULT_STYLE_MAP)
99 .put(KS_OLD.ATTRIBUTE, "fontname=\"helvetica-oblique\"")
100 .put(KS_OLD.POSITIVE_OPINION, "fontcolor=green4 fontname=\"helvetica-bold\"")
101 .put(Statements.VALUE_FACTORY.createIRI(KS_OLD.POSITIVE_OPINION + "-from"),
102 "color=green4 fontcolor=green4 penwidth=0.5")
103 .put(KS_OLD.NEGATIVE_OPINION, "fontcolor=red4 fontname=\"helvetica-bold\"")
104 .put(Statements.VALUE_FACTORY.createIRI(KS_OLD.NEGATIVE_OPINION + "-from"),
105 "color=red4 fontcolor=red4 penwidth=0.5")
106 .put(KS_OLD.NEUTRAL_OPINION, "fontcolor=ivory4 fontname=\"helvetica-bold\" ")
107 .put(Statements.VALUE_FACTORY.createIRI(KS_OLD.NEUTRAL_OPINION + "-from"),
108 "color=ivory4 fontcolor=ivory4 penwidth=0.5").build();
109 renderer = Renderer.builder().withNodeTypes(nodeTypes).withColorMap(colorMap)
110 .withStyleMap(styleMap).withNodeNamespaces(ImmutableSet.of()).build();
111 }
112
113 this.goldLabels = Sets.newHashSet(goldLabels);
114 this.testLabels = testLabels == null ? null : Sets.newHashSet(testLabels);
115 this.extractor = extractor;
116 this.reportPath = reportPath;
117 this.reportModel = reportPath == null ? null : Lists.newArrayList();
118 this.reportRenderer = renderer;
119 this.components = ImmutableSet.copyOf(components).toArray(new Component[0]);
120 this.evaluator = OpinionPrecisionRecall.evaluator();
121 this.lock = new ReentrantReadWriteLock();
122 }
123
124 public Analyzer add(final KAFDocument document) {
125
126 this.lock.readLock().lock();
127 try {
128 synchronized (document) {
129 doAdd(document);
130 }
131 } finally {
132 this.lock.readLock().unlock();
133 }
134 return this;
135 }
136
137 public Analyzer add(final Iterable<KAFDocument> documents) {
138
139 this.lock.readLock().lock();
140 try {
141 StreamSupport.stream(documents.spliterator(), true).forEach(document -> {
142 Preconditions.checkNotNull(document);
143 synchronized (document) {
144 doAdd(document);
145 }
146 });
147 } finally {
148 this.lock.readLock().unlock();
149 }
150 return this;
151 }
152
153 public OpinionPrecisionRecall complete() {
154
155 this.lock.writeLock().lock();
156 try {
157 return doComplete();
158 } finally {
159 this.lock.writeLock().unlock();
160 }
161 }
162
163 private synchronized void doAdd(final KAFDocument document) {
164
165
166 final List<Opinion> goldOpinions = Lists.newArrayList();
167 for (final String label : this.goldLabels) {
168 goldOpinions.addAll(document.getOpinions(label));
169 }
170
171
172 final List<Opinion> testOpinions;
173 if (this.extractor != null) {
174 if (this.components.length == Component.values().length) {
175 this.extractor.extract(document, "_test", this.components);
176 } else {
177 this.extractor.refine(document, this.goldLabels, "_test", this.components);
178 }
179 testOpinions = document.getOpinions("_test");
180 } else {
181 testOpinions = Lists.newArrayList();
182 for (final String label : this.testLabels) {
183 testOpinions.addAll(document.getOpinions(label));
184 }
185 }
186
187
188
189
190
191
192 this.evaluator.add(goldOpinions, testOpinions);
193
194
195 if (this.reportPath != null) {
196 final Multimap<Integer, Opinion> goldMap = toMultimap(goldOpinions);
197 final Multimap<Integer, Opinion> testMap = toMultimap(testOpinions);
198 for (int sentenceID = 1; sentenceID <= document.getNumSentences(); ++sentenceID) {
199 if (goldMap.containsKey(sentenceID) || testMap.containsKey(sentenceID)) {
200 final String file = new File(document.getPublic().publicId).getName() + "_"
201 + sentenceID + ".html";
202 final Model model = RDFGenerator.DEFAULT.generate(document,
203 ImmutableList.of(sentenceID));
204 final StringBuilder sentenceMarkup = new StringBuilder();
205 final StringBuilder sentenceParsing = new StringBuilder();
206 final StringBuilder sentenceGraph = new StringBuilder();
207 try {
208 renderOpinions(sentenceMarkup, document, sentenceID,
209 goldMap.get(sentenceID), testMap.get(sentenceID));
210 this.reportRenderer.renderParsing(sentenceParsing, document, model,
211 sentenceID);
212 this.reportRenderer.renderGraph(sentenceGraph, QuadModel.wrap(model),
213 Renderer.Algorithm.NEATO);
214 runTemplate(this.reportPath.resolve(file).toFile(), SENTENCE_TEMPLATE,
215 ImmutableMap.of("markup", sentenceMarkup, "parsing",
216 sentenceParsing, "graph", sentenceGraph));
217 } catch (final IOException ex) {
218 Throwables.propagate(ex);
219 }
220 final Map<String, Object> sentenceModel = Maps.newHashMap();
221 sentenceModel.put("file", file);
222 sentenceModel.put("document", document.getPublic().publicId);
223 sentenceModel.put("sentence", sentenceID);
224 sentenceModel.put("markup", sentenceMarkup);
225 synchronized (this.reportModel) {
226 this.reportModel.add(sentenceModel);
227 }
228 }
229 }
230 }
231 }
232
233 private OpinionPrecisionRecall doComplete() {
234
235
236 if (this.reportPath != null) {
237 try {
238 Collections.sort(this.reportModel, (final Map<String, Object> m1,
239 final Map<String, Object> m2) -> {
240 final String d1 = m1.get("document").toString();
241 final String d2 = m2.get("document").toString();
242 int result = d1.compareTo(d2);
243 if (result == 0) {
244 final int s1 = Integer.parseInt(m1.get("sentence").toString());
245 final int s2 = Integer.parseInt(m2.get("sentence").toString());
246 result = s1 - s2;
247 }
248 return result;
249 });
250 String file = null;
251 int fileCounter = 0;
252 for (final Map<String, Object> map : this.reportModel) {
253 final String curFile = map.get("document").toString();
254 fileCounter += curFile.equals(file) ? 0 : 1;
255 file = curFile;
256 map.put("id", "S" + fileCounter + "." + map.get("sentence"));
257 }
258 runTemplate(this.reportPath.resolve("index.html").toFile(), INDEX_TEMPLATE,
259 ImmutableMap.of("sentences", this.reportModel));
260 final String css = Resources.toString(
261 Analyzer.class.getResource(Analyzer.class.getSimpleName() + ".css"),
262 Charsets.UTF_8);
263 Files.write(css, this.reportPath.resolve("index.css").toFile(), Charsets.UTF_8);
264
265 } catch (final IOException ex) {
266 Throwables.propagate(ex);
267 }
268 }
269
270
271 return this.evaluator.getResult();
272 }
273
274 private static void renderOpinions(final Appendable out, final KAFDocument document,
275 final int sentenceID, final Iterable<Opinion> goldOpinions,
276 final Iterable<Opinion> testOpinions) throws IOException {
277
278
279 final List<Term> sentenceTerms = Ordering.from(Term.OFFSET_COMPARATOR).sortedCopy(
280 document.getSentenceTerms(sentenceID));
281 final Range sentenceRange = Range.enclose(NAFUtils.rangesFor(document, sentenceTerms));
282 final String text = document.getRawText().replace(" ", " ");
283
284
285 final Opinion[][] pairs = Util.align(Opinion.class, goldOpinions, testOpinions, true,
286 true, true, OpinionPrecisionRecall.matcher());
287
288
289 for (final Opinion[] pair : pairs) {
290
291
292 final Opinion goldOpinion = pair[0];
293 final Opinion testOpinion = pair[1];
294
295
296 final Set<Term> headTerms = Sets.newHashSet();
297 final Set<Range> targetGoldRanges = Sets.newHashSet();
298 final Set<Range> targetTestRanges = Sets.newHashSet();
299 final Set<Range> holderGoldRanges = Sets.newHashSet();
300 final Set<Range> holderTestRanges = Sets.newHashSet();
301 final Set<Range> expGoldRanges = Sets.newHashSet();
302 final Set<Range> expTestRanges = Sets.newHashSet();
303 Polarity goldPolarity = null;
304 Polarity testPolarity = null;
305
306
307 if (goldOpinion != null) {
308 if (goldOpinion.getOpinionTarget() != null) {
309 final List<Term> t = goldOpinion.getOpinionTarget().getSpan().getTargets();
310 targetGoldRanges.addAll(NAFUtils.rangesFor(document, t));
311 headTerms.addAll(NAFUtils.extractHeads(document, null, t, NAFUtils
312 .matchExtendedPos(document, "NN", "PRP", "JJP", "DTP", "WP", "VB")));
313 }
314 if (goldOpinion.getOpinionHolder() != null) {
315 final List<Term> h = goldOpinion.getOpinionHolder().getSpan().getTargets();
316 holderGoldRanges.addAll(NAFUtils.rangesFor(document, h));
317 headTerms.addAll(NAFUtils.extractHeads(document, null, h,
318 NAFUtils.matchExtendedPos(document, "NN", "PRP", "JJP", "DTP", "WP")));
319 }
320 if (goldOpinion.getOpinionExpression() != null) {
321 final List<Term> e = goldOpinion.getOpinionExpression().getSpan().getTargets();
322 expGoldRanges.addAll(NAFUtils.rangesFor(document, e));
323 headTerms.addAll(NAFUtils.extractHeads(document, null, e,
324 NAFUtils.matchExtendedPos(document, "NN", "VB", "JJ", "R")));
325 goldPolarity = Polarity.forExpression(goldOpinion.getOpinionExpression());
326 }
327 }
328
329
330 if (testOpinion != null) {
331 if (testOpinion.getOpinionTarget() != null) {
332 final List<Term> t = testOpinion.getOpinionTarget().getSpan().getTargets();
333 targetTestRanges.addAll(NAFUtils.rangesFor(document, t));
334 }
335 if (testOpinion.getOpinionHolder() != null) {
336 final List<Term> h = testOpinion.getOpinionHolder().getSpan().getTargets();
337 holderTestRanges.addAll(NAFUtils.rangesFor(document, h));
338 }
339 if (testOpinion.getOpinionExpression() != null) {
340 final List<Term> e = testOpinion.getOpinionExpression().getSpan().getTargets();
341 expTestRanges.addAll(NAFUtils.rangesFor(document, e));
342 testPolarity = Polarity.forExpression(testOpinion.getOpinionExpression());
343 }
344 }
345
346
347 final List<Range> headRanges = NAFUtils.rangesFor(document, headTerms);
348 @SuppressWarnings("unchecked")
349 final List<Range> ranges = sentenceRange.split(ImmutableSet.copyOf(Iterables
350 .<Range>concat(targetGoldRanges, targetTestRanges, holderGoldRanges,
351 holderTestRanges, expGoldRanges, expTestRanges, headRanges)));
352
353
354 out.append("<p class=\"opinion\">");
355 out.append("<span class=\"opinion-id\" title=\"Test label: ")
356 .append(testOpinion == null ? "-" : testOpinion.getLabel())
357 .append(", gold label: ")
358 .append(goldOpinion == null ? "-" : goldOpinion.getLabel()).append("\">")
359 .append(testOpinion == null ? "-" : testOpinion.getId()).append(" / ")
360 .append(goldOpinion == null ? "-" : goldOpinion.getId()).append("</span>");
361 for (final Range range : ranges) {
362 final boolean targetGold = range.containedIn(targetGoldRanges);
363 final boolean targetTest = range.containedIn(targetTestRanges);
364 final boolean holderGold = range.containedIn(holderGoldRanges);
365 final boolean holderTest = range.containedIn(holderTestRanges);
366 final boolean expGold = range.containedIn(expGoldRanges);
367 final boolean expTest = range.containedIn(expTestRanges);
368 final boolean head = range.containedIn(headRanges);
369 int spans = 0;
370 if (holderGold || holderTest) {
371 ++spans;
372 final String css = (holderGold ? "hg" : "") + " " + (holderTest ? "ht" : "");
373 out.append("<span class=\"").append(css).append("\">");
374 }
375 if (targetGold || targetTest) {
376 ++spans;
377 final String css = (targetGold ? "tg" : "") + " " + (targetTest ? "tt" : "");
378 out.append("<span class=\"").append(css).append("\">");
379 }
380 if (expGold || expTest) {
381 ++spans;
382 final String css = (expGold ? "eg" + goldPolarity.ordinal() : "") + " "
383 + (expTest ? "et" + testPolarity.ordinal() : "");
384 out.append("<span class=\"").append(css).append("\">");
385 }
386 if (head) {
387 ++spans;
388 out.append("<span class=\"head\">");
389 }
390 out.append(text.substring(range.begin(), range.end()));
391 for (int i = 0; i < spans; ++i) {
392 out.append("</span>");
393 }
394 }
395 out.append("</p>");
396 }
397 }
398
399 private static Multimap<Integer, Opinion> toMultimap(final Iterable<Opinion> opinions) {
400 final Multimap<Integer, Opinion> map = HashMultimap.create();
401 for (final Opinion opinion : opinions) {
402 int sentenceID = 1;
403 if (opinion.getExpressionSpan() != null && !opinion.getExpressionSpan().isEmpty()) {
404 sentenceID = opinion.getExpressionSpan().getTargets().get(0).getSent();
405 } else if (opinion.getHolderSpan() != null && !opinion.getHolderSpan().isEmpty()) {
406 sentenceID = opinion.getHolderSpan().getTargets().get(0).getSent();
407 } else if (opinion.getTargetSpan() != null && !opinion.getTargetSpan().isEmpty()) {
408 sentenceID = opinion.getHolderSpan().getTargets().get(0).getSent();
409 }
410 map.put(sentenceID, opinion);
411 }
412 return map;
413 }
414
415 private static Mustache loadTemplate(final String name) {
416 try {
417 final DefaultMustacheFactory factory = new DefaultMustacheFactory();
418 return factory.compile(new InputStreamReader(Analyzer.class.getResource(name)
419 .openStream(), Charsets.UTF_8), name);
420 } catch (final IOException ex) {
421 throw new Error(ex);
422 }
423 }
424
425 private static void runTemplate(final File file, final Mustache template, final Object model)
426 throws IOException {
427 try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(file.getAbsolutePath())))) {
428 template.execute(writer, model);
429 }
430 }
431
432 public static Analyzer create(final Iterable<String> goldLabels,
433 final Iterable<String> testLabels, @Nullable final Path reportPath,
434 final Component... components) {
435 return new Analyzer(goldLabels, testLabels, null, reportPath, components);
436 }
437
438 public static Analyzer create(final Iterable<String> goldLabels, final Extractor extractor,
439 @Nullable final Path reportPath, final Component... components) {
440 return new Analyzer(goldLabels, null, extractor, reportPath, components);
441 }
442
443 public static void main(final String... args) {
444
445 try {
446
447 final CommandLine cmd = CommandLine
448 .parser()
449 .withName("fssa-analyze")
450 .withHeader(
451 "Analyze the output of an opinion extractor, "
452 + "possibly emitting a per-sentence HTML report.")
453 .withOption("p", "properties", "a sequence of key=value properties, used to "
454 + "select and configure the trainer", "PROPS", Type.STRING, true,
455 false, false)
456 .withOption("c", "components", "the opinion components to consider: "
457 + "(e)xpression, (h)older, (t)arget, (p)olarity", "COMP", Type.STRING,
458 true, false, false)
459 .withOption("l", "labels", "the labels of gold opinions to consider, comma "
460 + "separated (no spaces)", "LABELS", Type.STRING, true, false, false)
461 .withOption("b", "test-labels",
462 "the labels of pre-existing test opinions to consider, comma "
463 + "separated (no spaces)", "LABELS", Type.STRING, true,
464 false, false)
465 .withOption("m", "model",
466 "the extractor model, in case opinion extraction is done on the fly",
467 "FILE", Type.FILE_EXISTING, true, false, false)
468 .withOption("r", "recursive",
469 "recurse into subdirectories of specified input paths")
470 .withOption("@", "list",
471 "interprets input as list of file names, one per line")
472 .withOption("o", "output", "the output path where to emit optional reports",
473 "DIR", Type.DIRECTORY, true, false, false)
474 .withOption(null, "wordnet", "wordnet dict path", "PATH",
475 Type.DIRECTORY_EXISTING, true, false, false)
476 .withFooter(
477 "Zero or more input paths can be specified, corresponding either "
478 + "to NAF files or directories that are scanned for NAF "
479 + "files. If the list is empty, an input NAF file will be "
480 + "read from the standard input. Exactly one option among -m "
481 + "and -b must be specified.")
482 .withLogger(LoggerFactory.getLogger("eu.fbk"))
483 .parse(args);
484
485
486 final Properties properties = Util.parseProperties(cmd.getOptionValue("p",
487 String.class, ""));
488 final Component[] components = Component.forLetters(
489 cmd.getOptionValue("c", String.class, "")).toArray(new Component[0]);
490 final Set<String> goldLabels = ImmutableSet.copyOf(Splitter.on(',').omitEmptyStrings()
491 .split(cmd.getOptionValue("l", String.class, "")));
492 final Set<String> testLabels = ImmutableSet.copyOf(Splitter.on(',').omitEmptyStrings()
493 .split(cmd.getOptionValue("b", String.class, "")));
494 final Path modelPath = cmd.getOptionValue("m", Path.class);
495 final boolean recursive = cmd.hasOption("r");
496 final boolean list = cmd.hasOption("@");
497 final Path outputPath = cmd.getOptionValue("o", Path.class, null);
498 final List<Path> inputPaths = Lists.newArrayList(cmd.getArgs(Path.class));
499 if (!testLabels.isEmpty() && modelPath != null) {
500 throw new IllegalArgumentException("Both option -m and -b were specified");
501 }
502
503 final String wordnetPath = cmd.getOptionValue("wordnet", String.class);
504 if (wordnetPath != null) {
505 WordNet.setPath(wordnetPath);
506 }
507
508
509 final List<Path> files = Util.fileMatch(inputPaths, ImmutableList.of(".naf",
510 ".naf.gz", ".naf.bz2", ".naf.xz", ".xml", ".xml.gz", ".xml.bz2", ".xml.xz"),
511 recursive, list);
512 final Iterable<KAFDocument> documents = files != null ? Corpus.create(false, files)
513 : ImmutableList.of(NAFUtils.readDocument(null));
514
515
516 final Extractor extractor = modelPath == null ? null
517 : Extractor.readFrom(modelPath, properties);
518
519
520 final Analyzer analyzer = extractor != null ? create(goldLabels, extractor,
521 outputPath, components) : create(goldLabels, testLabels, outputPath,
522 components);
523
524
525 final Tracker tracker = new Tracker(LOGGER, null,
526 "Processed %d NAF files (%d NAF/s avg)",
527 "Processed %d NAF files (%d NAF/s, %d NAF/s avg)");
528 tracker.start();
529 StreamSupport.stream(documents.spliterator(), false).forEach(
530 (final KAFDocument document) -> {
531 analyzer.add(document);
532 tracker.increment();
533 });
534 tracker.end();
535
536
537 final OpinionPrecisionRecall opr = analyzer.complete();
538 LOGGER.info("Measured performances:\n{}", opr);
539
540 } catch (final Throwable ex) {
541 CommandLine.fail(ex);
542 }
543 }
544
545 }