1 package eu.fbk.dkm.pikes.raid;
2
3 import com.google.common.base.Preconditions;
4 import com.google.common.base.Splitter;
5 import com.google.common.base.Throwables;
6 import com.google.common.collect.*;
7 import eu.fbk.dkm.pikes.naflib.Corpus;
8 import eu.fbk.dkm.pikes.resources.NAFUtils;
9 import eu.fbk.dkm.pikes.resources.WordNet;
10 import eu.fbk.utils.core.CommandLine;
11 import eu.fbk.utils.core.CommandLine.Type;
12 import eu.fbk.utils.svm.Util;
13 import eu.fbk.rdfpro.util.Tracker;
14 import ixa.kaflib.KAFDocument;
15 import ixa.kaflib.Opinion;
16 import ixa.kaflib.Opinion.OpinionExpression;
17 import org.slf4j.Logger;
18 import org.slf4j.LoggerFactory;
19
20 import javax.annotation.Nullable;
21 import java.io.BufferedReader;
22 import java.io.BufferedWriter;
23 import java.io.File;
24 import java.io.IOException;
25 import java.lang.reflect.Constructor;
26 import java.lang.reflect.InvocationTargetException;
27 import java.nio.file.Files;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30 import java.nio.file.StandardOpenOption;
31 import java.util.*;
32 import java.util.function.Function;
33 import java.util.stream.StreamSupport;
34
35 public abstract class Extractor {
36
37 private static final Logger LOGGER = LoggerFactory.getLogger(Extractor.class);
38
39 public final void extract(final Iterable<KAFDocument> documents, final String outLabel,
40 final Component... components) {
41
42
43 final EnumSet<Component> componentSet = Component.toSet(components);
44 StreamSupport.stream(documents.spliterator(), true).forEach(document -> {
45 Preconditions.checkNotNull(document);
46 synchronized (document) {
47 extract(document, outLabel, componentSet);
48 }
49 });
50 }
51
52 public final void extract(final KAFDocument document, @Nullable final String outLabel,
53 final Component... components) {
54
55
56 Preconditions.checkNotNull(document);
57 final EnumSet<Component> componentSet = Component.toSet(components);
58 synchronized (document) {
59 extract(document, outLabel, componentSet);
60 }
61 }
62
63 public final void refine(final Iterable<KAFDocument> documents,
64 @Nullable final Iterable<String> inLabels, @Nullable final String outLabel,
65 final Component... components) {
66
67
68 final EnumSet<Component> componentSet = Component.toSet(components);
69 StreamSupport.stream(documents.spliterator(), true).forEach(document -> {
70 Preconditions.checkNotNull(document);
71 synchronized (document) {
72 refine(document, inLabels, outLabel, componentSet);
73 }
74 });
75 }
76
77 public final void refine(final KAFDocument document,
78 @Nullable final Iterable<String> inLabels, @Nullable final String outLabel,
79 final Component... components) {
80
81
82 Preconditions.checkNotNull(document);
83 final EnumSet<Component> componentSet = Component.toSet(components);
84 synchronized (document) {
85 refine(document, inLabels, outLabel, componentSet);
86 }
87 }
88
89 private void extract(final KAFDocument document, @Nullable final String outLabel,
90 final EnumSet<Component> components) {
91
92
93 doFilter(document);
94
95
96 for (final Opinion opinion : document.getOpinions(outLabel)) {
97 document.removeAnnotation(opinion);
98 }
99
100
101 final int numSentences = document.getNumSentences() + 1;
102 for (int i = 0; i < numSentences; ++i) {
103
104
105 final Iterable<Opinion> opinions = doExtract(document, i, components);
106
107
108 Opinions.retain(opinions, null, components);
109 for (final Opinion opinion : opinions) {
110 opinion.setLabel(outLabel);
111 }
112 }
113 }
114
115 private void refine(final KAFDocument document, @Nullable final Iterable<String> inLabels,
116 @Nullable final String outLabel, final EnumSet<Component> components) {
117
118
119 doFilter(document);
120
121
122 List<Opinion> opinions;
123 if (inLabels == null || Iterables.isEmpty(inLabels)) {
124 opinions = Lists.newArrayList(document.getOpinions());
125 } else {
126 opinions = Lists.newArrayList();
127 for (final String inLabel : inLabels) {
128 opinions.addAll(document.getOpinions(inLabel));
129 }
130 }
131
132
133 final ListMultimap<Integer, Opinion> inOpinions = ArrayListMultimap.create();
134 for (final Opinion opinion : opinions) {
135 final OpinionExpression exp = opinion.getOpinionExpression();
136 inOpinions.put(exp.getSpan().getTargets().get(0).getSent(), opinion);
137 }
138
139
140 document.removeAnnotations(document.getOpinions(outLabel));
141
142
143 for (final Map.Entry<Integer, Collection<Opinion>> entry : inOpinions.asMap().entrySet()) {
144
145
146 final int sentenceID = entry.getKey();
147 final List<Opinion> refinedOpinions = Lists.newArrayList();
148 for (final Opinion inOpinion : entry.getValue()) {
149
150
151 final Iterable<Opinion> outOpinions = doRefine(document, sentenceID, components,
152 inOpinion);
153 if (Iterables.isEmpty(outOpinions)) {
154 System.out.println("*******");
155 }
156 Iterables.addAll(refinedOpinions, outOpinions);
157
158
159 Opinions.retain(outOpinions, inOpinion, components);
160 for (final Opinion outOpinion : outOpinions) {
161 outOpinion.setLabel(outLabel);
162 }
163 }
164 Opinions.deduplicate(document, refinedOpinions);
165 }
166 }
167
168 protected void doFilter(final KAFDocument document) {
169
170 }
171
172 protected abstract Iterable<Opinion> doExtract(KAFDocument document, int sentenceID,
173 EnumSet<Component> components);
174
175 protected abstract Iterable<Opinion> doRefine(KAFDocument document, int sentenceID,
176 EnumSet<Component> components, Opinion opinion);
177
178 protected void doWrite(final Properties properties, final Path path) throws IOException {
179
180 }
181
182 public final void writeTo(final Path path) throws IOException {
183
184 final Path p = Util.openVFS(path, true);
185 try {
186
187 final Properties properties = new Properties();
188
189
190 doWrite(properties, p);
191
192
193 properties.put("class", getClass().getName());
194
195
196 try (BufferedWriter writer = Files.newBufferedWriter(p.resolve("properties"),
197 StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE)) {
198 properties.store(writer, "");
199 }
200
201 } finally {
202 Util.closeVFS(p);
203 }
204 }
205
206 public static Extractor readFrom(final Path path, @Nullable final Properties customProperties)
207 throws IOException {
208
209 final Path p = Util.openVFS(path, false);
210 try {
211
212 final Properties properties = new Properties();
213 try (BufferedReader reader = Files.newBufferedReader(p.resolve("properties"))) {
214 properties.load(reader);
215 }
216 if (customProperties != null) {
217 properties.putAll(customProperties);
218 }
219
220
221 final String implementationName = properties.getProperty("class");
222 try {
223 final Class<?> implementationClass = Class.forName(implementationName);
224 final Constructor<?> constructor = implementationClass.getDeclaredConstructor(
225 Properties.class, Path.class);
226 constructor.setAccessible(true);
227 return (Extractor) constructor.newInstance(properties, p);
228 } catch (final InvocationTargetException ex) {
229 Throwables.propagateIfInstanceOf(ex, IOException.class);
230 throw Throwables.propagate(ex);
231 } catch (final NoSuchMethodException | ClassNotFoundException | IllegalAccessException
232 | InstantiationException ex) {
233 throw new IllegalArgumentException("Could not instantiate class "
234 + implementationName);
235 }
236
237 } finally {
238 Util.closeVFS(p);
239 }
240 }
241
242 public static void main(final String... args) {
243
244 try {
245
246 final CommandLine cmd = CommandLine
247 .parser()
248 .withName("fssa-extract")
249 .withHeader(
250 "Extracts opinion expressions, holders and targets "
251 + "from one or multiple NAF files.")
252 .withOption("p", "properties", "a sequence of key=value properties, used to "
253 + "select and configure the trainer", "PROPS", Type.STRING, true,
254 false, false)
255 .withOption("c", "components", "the opinion components to consider: "
256 + "(e)xpression, (h)older, (t)arget, (p)olarity", "COMP", Type.STRING,
257 true, false, false)
258 .withOption("m", "model", "the extractor model", "FILE", Type.FILE_EXISTING,
259 true, false, true)
260 .withOption("l", "labels", "the labels of the existing opinions to modify "
261 + "(refine mode), comma separated (no spaces)", "LABELS",
262 Type.STRING, true, false, false)
263 .withOption("b", "outlabel", "the label to associate to produced opinions",
264 "LABEL", Type.STRING, true, false, false)
265 .withOption("r", "recursive",
266 "recurse into subdirectories of specified input paths")
267 .withOption("@", "list",
268 "interprets input as list of file names, one per line")
269 .withOption("o", "output", "the output file or directory name", "FILE",
270 Type.FILE, true, false, false)
271 .withOption("f", "format", "the output format", "FMT", Type.STRING, true,
272 false, false)
273 .withOption("j", "junk",
274 "junk path structure in input files and emits a flat list of files")
275 .withOption(null, "wordnet", "wordnet dict path", "PATH",
276 Type.DIRECTORY_EXISTING, true, false, false)
277 .withFooter(
278 "Zero or more input paths can be specified, corresponding either "
279 + "to NAF files or\ndirectories that are scanned for NAF "
280 + "files. If the list is empty, an input NAF\nfile will be "
281 + "read from the standard input. If no output path is "
282 + "specified (-o),\noutput is written to standard output.")
283 .withLogger(LoggerFactory.getLogger("eu.fbk"))
284 .parse(args);
285
286
287 final Properties properties = Util.parseProperties(cmd.getOptionValue("p",
288 String.class, ""));
289 final Component[] components = Component.forLetters(
290 cmd.getOptionValue("c", String.class, "")).toArray(new Component[0]);
291 final Set<String> inputLabels = ImmutableSet.copyOf(Splitter.on(',')
292 .omitEmptyStrings().split(cmd.getOptionValue("l", String.class, "")));
293 final String outputLabel = cmd.getOptionValue("b", String.class, null);
294 final Path modelPath = cmd.getOptionValue("m", Path.class);
295 final boolean recursive = cmd.hasOption("r");
296 final boolean list = cmd.hasOption("@");
297 final File outputPath = cmd.getOptionValue("o", File.class, null);
298 final String format = cmd.getOptionValue("f", String.class, ".naf");
299 final boolean junk = cmd.hasOption("j");
300 final List<Path> inputPaths = Lists.newArrayList(cmd.getArgs(Path.class));
301
302 final String wordnetPath = cmd.getOptionValue("wordnet", String.class);
303 if (wordnetPath != null) {
304 WordNet.setPath(wordnetPath);
305 }
306
307
308 final Extractor extractor = readFrom(modelPath, properties);
309
310
311 final List<Path> files = Util.fileMatch(inputPaths, ImmutableList.of(".naf",
312 ".naf.gz", ".naf.bz2", ".naf.xz", ".xml", ".xml.gz", ".xml.bz2", ".xml.xz"),
313 recursive, list);
314 final Iterable<KAFDocument> documents = files != null ? Corpus.create(false, files)
315 : ImmutableList.of(NAFUtils.readDocument(null));
316
317
318 final Function<String, String> namer = outputPath == null ? null
319 : Util.fileRenamer("", outputPath.getAbsolutePath() + "/", format, junk);
320 final Tracker tracker = new Tracker(LOGGER, null,
321 "Processed %d NAF files (%d NAF/s avg)",
322 "Processed %d NAF files (%d NAF/s, %d NAF/s avg)");
323 tracker.start();
324 StreamSupport.stream(documents.spliterator(), false).forEach(
325 (final KAFDocument document) -> {
326 if (inputLabels.isEmpty()) {
327 extractor.extract(document, outputLabel, components);
328 } else {
329 extractor.refine(document, inputLabels, outputLabel, components);
330 }
331 try {
332 NAFUtils.writeDocument(document, namer == null ? null
333 : Paths.get(namer.apply(document.getPublic().publicId)));
334 } catch (final IOException ex) {
335 throw Throwables.propagate(ex);
336 }
337 tracker.increment();
338 });
339 tracker.end();
340
341 } catch (final Throwable ex) {
342 CommandLine.fail(ex);
343 }
344 }
345
346 }