1   package eu.fbk.dkm.pikes.raid;
2   
3   import com.google.common.base.Preconditions;
4   import com.google.common.base.Splitter;
5   import com.google.common.base.Throwables;
6   import com.google.common.collect.*;
7   import eu.fbk.dkm.pikes.naflib.Corpus;
8   import eu.fbk.dkm.pikes.resources.NAFUtils;
9   import eu.fbk.dkm.pikes.resources.WordNet;
10  import eu.fbk.utils.core.CommandLine;
11  import eu.fbk.utils.core.CommandLine.Type;
12  import eu.fbk.utils.svm.Util;
13  import eu.fbk.rdfpro.util.Tracker;
14  import ixa.kaflib.KAFDocument;
15  import ixa.kaflib.Opinion;
16  import ixa.kaflib.Opinion.OpinionExpression;
17  import org.slf4j.Logger;
18  import org.slf4j.LoggerFactory;
19  
20  import javax.annotation.Nullable;
21  import java.io.BufferedReader;
22  import java.io.BufferedWriter;
23  import java.io.File;
24  import java.io.IOException;
25  import java.lang.reflect.Constructor;
26  import java.lang.reflect.InvocationTargetException;
27  import java.nio.file.Files;
28  import java.nio.file.Path;
29  import java.nio.file.Paths;
30  import java.nio.file.StandardOpenOption;
31  import java.util.*;
32  import java.util.function.Function;
33  import java.util.stream.StreamSupport;
34  
35  public abstract class Extractor {
36  
37      private static final Logger LOGGER = LoggerFactory.getLogger(Extractor.class);
38  
39      public final void extract(final Iterable<KAFDocument> documents, final String outLabel,
40              final Component... components) {
41  
42          // Validate components and process supplied documents using parallelization
43          final EnumSet<Component> componentSet = Component.toSet(components);
44          StreamSupport.stream(documents.spliterator(), true).forEach(document -> {
45              Preconditions.checkNotNull(document);
46              synchronized (document) {
47                  extract(document, outLabel, componentSet);
48              }
49          });
50      }
51  
52      public final void extract(final KAFDocument document, @Nullable final String outLabel,
53              final Component... components) {
54  
55          // Validate document and components, then delegate
56          Preconditions.checkNotNull(document);
57          final EnumSet<Component> componentSet = Component.toSet(components);
58          synchronized (document) {
59              extract(document, outLabel, componentSet);
60          }
61      }
62  
63      public final void refine(final Iterable<KAFDocument> documents,
64              @Nullable final Iterable<String> inLabels, @Nullable final String outLabel,
65              final Component... components) {
66  
67          // Validate components and process supplied documents, possibly using parallelization
68          final EnumSet<Component> componentSet = Component.toSet(components);
69          StreamSupport.stream(documents.spliterator(), true).forEach(document -> {
70              Preconditions.checkNotNull(document);
71              synchronized (document) {
72                  refine(document, inLabels, outLabel, componentSet);
73              }
74          });
75      }
76  
77      public final void refine(final KAFDocument document,
78              @Nullable final Iterable<String> inLabels, @Nullable final String outLabel,
79              final Component... components) {
80  
81          // Validate document and components, then delegate
82          Preconditions.checkNotNull(document);
83          final EnumSet<Component> componentSet = Component.toSet(components);
84          synchronized (document) {
85              refine(document, inLabels, outLabel, componentSet);
86          }
87      }
88  
89      private void extract(final KAFDocument document, @Nullable final String outLabel,
90              final EnumSet<Component> components) {
91  
92          // Filter the document
93          doFilter(document);
94  
95          // Remove all the opinions for the output label
96          for (final Opinion opinion : document.getOpinions(outLabel)) {
97              document.removeAnnotation(opinion);
98          }
99  
100         // Process the document one sentence at a time
101         final int numSentences = document.getNumSentences() + 1;
102         for (int i = 0; i < numSentences; ++i) {
103 
104             // Extract opinions from current sentence graph
105             final Iterable<Opinion> opinions = doExtract(document, i, components);
106 
107             // Ensure that extracted opinions contain only requested components
108             Opinions.retain(opinions, null, components);
109             for (final Opinion opinion : opinions) {
110                 opinion.setLabel(outLabel);
111             }
112         }
113     }
114 
115     private void refine(final KAFDocument document, @Nullable final Iterable<String> inLabels,
116             @Nullable final String outLabel, final EnumSet<Component> components) {
117 
118         // Filter the document
119         doFilter(document);
120 
121         // Identify the opinions to modify
122         List<Opinion> opinions;
123         if (inLabels == null || Iterables.isEmpty(inLabels)) {
124             opinions = Lists.newArrayList(document.getOpinions());
125         } else {
126             opinions = Lists.newArrayList();
127             for (final String inLabel : inLabels) {
128                 opinions.addAll(document.getOpinions(inLabel));
129             }
130         }
131 
132         // Index the resulting opinions by sentence
133         final ListMultimap<Integer, Opinion> inOpinions = ArrayListMultimap.create();
134         for (final Opinion opinion : opinions) {
135             final OpinionExpression exp = opinion.getOpinionExpression();
136             inOpinions.put(exp.getSpan().getTargets().get(0).getSent(), opinion);
137         }
138 
139         // Remove all the opinions for the output label
140         document.removeAnnotations(document.getOpinions(outLabel));
141 
142         // Perform refining, processing all the sentences for which at least an opinion is defined
143         for (final Map.Entry<Integer, Collection<Opinion>> entry : inOpinions.asMap().entrySet()) {
144 
145             // Process all the opinions of the sentence
146             final int sentenceID = entry.getKey();
147             final List<Opinion> refinedOpinions = Lists.newArrayList();
148             for (final Opinion inOpinion : entry.getValue()) {
149 
150                 // Perform refining
151                 final Iterable<Opinion> outOpinions = doRefine(document, sentenceID, components,
152                         inOpinion);
153                 if (Iterables.isEmpty(outOpinions)) {
154                     System.out.println("*******");
155                 }
156                 Iterables.addAll(refinedOpinions, outOpinions);
157 
158                 // Ensure resulting opinions preserve components that don't have to be refined
159                 Opinions.retain(outOpinions, inOpinion, components);
160                 for (final Opinion outOpinion : outOpinions) {
161                     outOpinion.setLabel(outLabel);
162                 }
163             }
164             Opinions.deduplicate(document, refinedOpinions);
165         }
166     }
167 
168     protected void doFilter(final KAFDocument document) {
169         // can be overridden by subclasses
170     }
171 
172     protected abstract Iterable<Opinion> doExtract(KAFDocument document, int sentenceID,
173             EnumSet<Component> components);
174 
175     protected abstract Iterable<Opinion> doRefine(KAFDocument document, int sentenceID,
176             EnumSet<Component> components, Opinion opinion);
177 
178     protected void doWrite(final Properties properties, final Path path) throws IOException {
179         // can be overridden by subclasses
180     }
181 
182     public final void writeTo(final Path path) throws IOException {
183 
184         final Path p = Util.openVFS(path, true);
185         try {
186             // Create properties
187             final Properties properties = new Properties();
188 
189             // Store custom entries
190             doWrite(properties, p);
191 
192             // Store name of implementation class
193             properties.put("class", getClass().getName());
194 
195             // Add a file for the properties
196             try (BufferedWriter writer = Files.newBufferedWriter(p.resolve("properties"),
197                     StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.CREATE)) {
198                 properties.store(writer, "");
199             }
200 
201         } finally {
202             Util.closeVFS(p);
203         }
204     }
205 
206     public static Extractor readFrom(final Path path, @Nullable final Properties customProperties)
207             throws IOException {
208 
209         final Path p = Util.openVFS(path, false);
210         try {
211             // Read stored properties entry and apply custom properties
212             final Properties properties = new Properties();
213             try (BufferedReader reader = Files.newBufferedReader(p.resolve("properties"))) {
214                 properties.load(reader);
215             }
216             if (customProperties != null) {
217                 properties.putAll(customProperties);
218             }
219 
220             // Select the implementation class and delegate to its doRead() static method
221             final String implementationName = properties.getProperty("class");
222             try {
223                 final Class<?> implementationClass = Class.forName(implementationName);
224                 final Constructor<?> constructor = implementationClass.getDeclaredConstructor(
225                         Properties.class, Path.class);
226                 constructor.setAccessible(true);
227                 return (Extractor) constructor.newInstance(properties, p);
228             } catch (final InvocationTargetException ex) {
229                 Throwables.propagateIfInstanceOf(ex, IOException.class);
230                 throw Throwables.propagate(ex);
231             } catch (final NoSuchMethodException | ClassNotFoundException | IllegalAccessException
232                     | InstantiationException ex) {
233                 throw new IllegalArgumentException("Could not instantiate class "
234                         + implementationName);
235             }
236 
237         } finally {
238             Util.closeVFS(p);
239         }
240     }
241 
242     public static void main(final String... args) {
243 
244         try {
245             // Parse command line
246             final CommandLine cmd = CommandLine
247                     .parser()
248                     .withName("fssa-extract")
249                     .withHeader(
250                             "Extracts opinion expressions, holders and targets "
251                                     + "from one or multiple NAF files.")
252                     .withOption("p", "properties", "a sequence of key=value properties, used to " //
253                             + "select and configure the trainer", "PROPS", Type.STRING, true,
254                             false, false)
255                     .withOption("c", "components", "the opinion components to consider: " //
256                             + "(e)xpression, (h)older, (t)arget, (p)olarity", "COMP", Type.STRING,
257                             true, false, false)
258                     .withOption("m", "model", "the extractor model", "FILE", Type.FILE_EXISTING,
259                             true, false, true)
260                     .withOption("l", "labels", "the labels of the existing opinions to modify " //
261                             + "(refine mode), comma separated  (no spaces)", "LABELS",
262                             Type.STRING, true, false, false)
263                     .withOption("b", "outlabel", "the label to associate to produced opinions",
264                             "LABEL", Type.STRING, true, false, false)
265                     .withOption("r", "recursive",
266                             "recurse into subdirectories of specified input paths")
267                     .withOption("@", "list",
268                             "interprets input as list of file names, one per line")
269                     .withOption("o", "output", "the output file or directory name", "FILE",
270                             Type.FILE, true, false, false)
271                     .withOption("f", "format", "the output format", "FMT", Type.STRING, true,
272                             false, false)
273                     .withOption("j", "junk",
274                             "junk path structure in input files and emits a flat list of files")
275                     .withOption(null, "wordnet", "wordnet dict path", "PATH",
276                             Type.DIRECTORY_EXISTING, true, false, false)
277                     .withFooter(
278                             "Zero or more input paths can be specified, corresponding either "
279                                     + "to NAF files or\ndirectories that are scanned for NAF "
280                                     + "files. If the list is empty, an input NAF\nfile will be "
281                                     + "read from the standard input. If no output path is "
282                                     + "specified (-o),\noutput is written to standard output.")
283                     .withLogger(LoggerFactory.getLogger("eu.fbk")) //
284                     .parse(args);
285 
286             // Extract options
287             final Properties properties = Util.parseProperties(cmd.getOptionValue("p",
288                     String.class, ""));
289             final Component[] components = Component.forLetters(
290                     cmd.getOptionValue("c", String.class, "")).toArray(new Component[0]);
291             final Set<String> inputLabels = ImmutableSet.copyOf(Splitter.on(',')
292                     .omitEmptyStrings().split(cmd.getOptionValue("l", String.class, "")));
293             final String outputLabel = cmd.getOptionValue("b", String.class, null);
294             final Path modelPath = cmd.getOptionValue("m", Path.class);
295             final boolean recursive = cmd.hasOption("r");
296             final boolean list = cmd.hasOption("@");
297             final File outputPath = cmd.getOptionValue("o", File.class, null);
298             final String format = cmd.getOptionValue("f", String.class, ".naf");
299             final boolean junk = cmd.hasOption("j");
300             final List<Path> inputPaths = Lists.newArrayList(cmd.getArgs(Path.class));
301 
302             final String wordnetPath = cmd.getOptionValue("wordnet", String.class);
303             if (wordnetPath != null) {
304                 WordNet.setPath(wordnetPath);
305             }
306 
307             // Setup the opinion extractor
308             final Extractor extractor = readFrom(modelPath, properties);
309 
310             // Identify input
311             final List<Path> files = Util.fileMatch(inputPaths, ImmutableList.of(".naf",
312                     ".naf.gz", ".naf.bz2", ".naf.xz", ".xml", ".xml.gz", ".xml.bz2", ".xml.xz"),
313                     recursive, list);
314             final Iterable<KAFDocument> documents = files != null ? Corpus.create(false, files)
315                     : ImmutableList.of(NAFUtils.readDocument(null));
316 
317             // Perform the extraction
318             final Function<String, String> namer = outputPath == null ? null //
319                     : Util.fileRenamer("", outputPath.getAbsolutePath() + "/", format, junk);
320             final Tracker tracker = new Tracker(LOGGER, null, //
321                     "Processed %d NAF files (%d NAF/s avg)", //
322                     "Processed %d NAF files (%d NAF/s, %d NAF/s avg)");
323             tracker.start();
324             StreamSupport.stream(documents.spliterator(), false).forEach(
325                     (final KAFDocument document) -> {
326                         if (inputLabels.isEmpty()) {
327                             extractor.extract(document, outputLabel, components);
328                         } else {
329                             extractor.refine(document, inputLabels, outputLabel, components);
330                         }
331                         try {
332                             NAFUtils.writeDocument(document, namer == null ? null //
333                                     : Paths.get(namer.apply(document.getPublic().publicId)));
334                         } catch (final IOException ex) {
335                             throw Throwables.propagate(ex);
336                         }
337                         tracker.increment();
338                     });
339             tracker.end();
340 
341         } catch (final Throwable ex) {
342             CommandLine.fail(ex);
343         }
344     }
345 
346 }