1   package eu.fbk.dkm.pikes.resources;
2   
import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.*;
import com.google.common.io.Resources;
import eu.fbk.utils.core.CommandLine;
import eu.fbk.utils.core.StaxParser;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import javax.xml.stream.XMLStreamException;
import java.io.*;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
20  
21  public final class PropBank {
22  
23      private static final List<Roleset> ROLESETS;
24  
25      private static final Map<String, Roleset> ID_INDEX;
26  
27      private static final ListMultimap<String, Roleset> LEMMA_INDEX;
28  
29      static {
30          try {
31              final Map<String, int[]> corefMap = Maps.newHashMap();
32              for (final String line : Resources.readLines(
33                      PropBank.class.getResource("PropBank.coref"), Charsets.UTF_8)) {
34                  final String[] tokens = line.split("\\s+");
35                  final int[] roles = new int[] { Integer.parseInt(tokens[1]),
36                          Integer.parseInt(tokens[2]) };
37                  corefMap.put(tokens[0], roles);
38              }
39  
40              final Map<String, Roleset> idIndex = Maps.newLinkedHashMap();
41              final ListMultimap<String, Roleset> lemmaIndex = ArrayListMultimap.create();
42  
43              final BufferedReader reader = Resources.asCharSource(
44                      PropBank.class.getResource("PropBank.tsv"), Charsets.UTF_8)
45                      .openBufferedStream();
46  
47              String line;
48              while ((line = reader.readLine()) != null) {
49  
50                  // Extract frame data
51                  final String[] tokens = Iterables.toArray(Splitter.on('\t').split(line),
52                          String.class);
53                  final String id = tokens[0];
54                  final String lemma = tokens[1];
55                  final String name = tokens[2];
56                  final List<String> vnFrames = Splitter.on('|').splitToList(tokens[3]);
57                  final List<String> fnFrames = Splitter.on('|').splitToList(tokens[4]);
58                  final List<String> eventTypes = Splitter.on('|').splitToList(tokens[5]);
59  
60                  // Extract role data
61                  final List<String> argDescr = Lists.newArrayList();
62                  final List<List<String>> argVNRoles = Lists.newArrayList();
63                  final List<List<String>> argFNRoles = Lists.newArrayList();
64                  for (int i = 0; i < 6; ++i) {
65                      argDescr.add(null);
66                      argVNRoles.add(null);
67                      argFNRoles.add(null);
68                  }
69                  for (int i = 6; i + 3 < tokens.length; i += 4) {
70                      final int num = Integer.parseInt(tokens[i]);
71                      argDescr.set(num, tokens[i + 1]);
72                      argVNRoles.set(num, Splitter.on('|').splitToList(tokens[i + 2]));
73                      argFNRoles.set(num, Splitter.on('|').splitToList(tokens[i + 3]));
74                  }
75  
76                  // Create and index the roleset
77                  final int[] corefRoles = corefMap.get(id);
78                  final int entityRole = corefRoles == null ? -1 : corefRoles[0];
79                  final int predicateRole = corefRoles == null ? -1 : corefRoles[1];
80                  final Roleset roleset = new Roleset(id, lemma, name, vnFrames, fnFrames,
81                          eventTypes, argDescr, argVNRoles, argFNRoles, entityRole, predicateRole);
82                  idIndex.put(id, roleset);
83                  lemmaIndex.put(lemma, roleset);
84              }
85  
86              reader.close();
87  
88              ROLESETS = ImmutableList.copyOf(idIndex.values());
89              ID_INDEX = ImmutableMap.copyOf(idIndex);
90              LEMMA_INDEX = ImmutableListMultimap.copyOf(lemmaIndex);
91  
92          } catch (final IOException ex) {
93              throw new Error("Cannot load eu.fbk.dkm.pikes.resources.PropBank data", ex);
94          }
95      }
96  
97      @Nullable
98      public static Roleset getRoleset(@Nullable final String id) {
99          return ID_INDEX.get(id == null ? null : id.toLowerCase());
100     }
101 
102     public static List<Roleset> getRolesets(@Nullable final String lemma) {
103         return LEMMA_INDEX.get(lemma == null ? null : lemma.toLowerCase());
104     }
105 
106     public static List<Roleset> getRolesets() {
107         return ROLESETS;
108     }
109 
110     public static void main(final String[] args) throws IOException, XMLStreamException {
111 
112         try {
113             final CommandLine cmd = CommandLine
114                     .parser()
115                     .withName("PropBankBank")
116                     .withHeader(
117                             "Generate a TSV file with indexed eu.fbk.dkm.pikes.resources.PropBank data, "
118                                     + "including mapping to eu.fbk.dkm.pikes.resources.VerbNet and eu.fbk.dkm.pikes.resources.FrameNet from the PredicateMatrix")
119                     .withOption("f", "frames", "the directory containing frame definitions",
120                             "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
121                     .withOption("m", "matrix", "the file containing the predicate matrix", "FILE",
122                             CommandLine.Type.FILE_EXISTING, true, false, true)
123                     .withOption("o", "output", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
124                     .withLogger(LoggerFactory.getLogger("eu.fbk.nafview")).parse(args);
125 
126             final File dir = cmd.getOptionValue("f", File.class);
127             final File pm = cmd.getOptionValue("m", File.class);
128             final File output = cmd.getOptionValue("o", File.class);
129 
130             // Parse the predicate matrix
131             final Matrix matrix = new Matrix(pm);
132 
133             final Writer writer = new OutputStreamWriter(new BufferedOutputStream(
134                     new FileOutputStream(output)), Charsets.UTF_8);
135 
136             final File[] files = dir.listFiles();
137             Arrays.sort(files);
138 
139             for (final File file : files) {
140                 if (file.getName().endsWith(".xml")) {
141                     System.out.println("Processing " + file);
142                     final Reader reader = new BufferedReader(new FileReader(file));
143                     try {
144                         new Parser(reader, matrix).parse(writer);
145                     } finally {
146                         reader.close();
147                     }
148                 }
149             }
150 
151         } catch (final Throwable ex) {
152             CommandLine.fail(ex);
153         }
154     }
155 
156     private static class Matrix {
157 
158         final Multimap<String, String> vnFrames;
159 
160         final Multimap<String, String> fnFrames;
161 
162         final Multimap<String, String> eventTypes;
163 
164         final Multimap<String, String> vnRoles;
165 
166         final Multimap<String, String> fnRoles;
167 
168         Matrix(final File file) throws IOException {
169 
170             this.vnFrames = HashMultimap.create();
171             this.fnFrames = HashMultimap.create();
172             this.eventTypes = HashMultimap.create();
173             this.vnRoles = HashMultimap.create();
174             this.fnRoles = HashMultimap.create();
175 
176             parseMatrix(file);
177         }
178 
179         private void parseMatrix(final File matrixFile) throws IOException {
180 
181             final BufferedReader in = new BufferedReader(new InputStreamReader(
182                     new FileInputStream(matrixFile), Charsets.UTF_8));
183 
184             try {
185                 // Process the predicate matrix file one line at a time
186                 String line;
187                 while ((line = in.readLine()) != null) {
188 
189                     // Split the line in its cells. Skip line if there are not enough cells
190                     final String[] tokens = line.split("\t");
191                     if (tokens.length <= 18) {
192                         continue;
193                     }
194 
195                     // Extract the eu.fbk.dkm.pikes.resources.PropBank frame and role. Skip line if NULL
196                     final String pbFrame = parseMatrixValue(tokens[11]);
197                     if (pbFrame == null) {
198                         continue;
199                     }
200                     final String pbRole = parseMatrixValue(tokens[12]);
201                     final String pbFrameRole = pbFrame + pbRole;
202 
203                     // Extract and index eu.fbk.dkm.pikes.resources.VerbNet data: class, subclass, role
204                     final String vnClass = parseMatrixValue(tokens[0]);
205                     final String vnSubClass = parseMatrixValue(tokens[2]);
206                     final String vnFrame = vnSubClass != null ? vnSubClass : vnClass;
207                     final String vnRole = parseMatrixValue(tokens[5]);
208                     if (vnSubClass != null && vnClass != null && !vnSubClass.startsWith(vnClass)) {
209                         System.err.println("Unexpected VN class / subclass pair: " + vnClass
210                                 + ", " + vnSubClass);
211                     }
212                     if (vnFrame != null) {
213                         this.vnFrames.put(pbFrame, vnFrame);
214                         if (vnRole != null) {
215                             this.vnRoles.put(pbFrameRole, vnRole);
216                         }
217                     }
218 
219                     // Extract and index eu.fbk.dkm.pikes.resources.FrameNet data: frame and frame element
220                     final String fnFrame = parseMatrixValue(tokens[8]);
221                     final String fnRole = parseMatrixValue(tokens[10]);
222                     if (fnFrame != null) {
223                         this.fnFrames.put(pbFrame, fnFrame);
224                         if (fnRole != null) {
225                             this.fnRoles.put(pbFrameRole, fnRole);
226                         }
227                     }
228 
229                     // Extract and index event type
230                     final String eventType = parseMatrixValue(tokens[17]);
231                     if (eventType != null) {
232                         this.eventTypes.put(pbFrame, eventType);
233                     }
234                 }
235             } finally {
236                 in.close();
237             }
238         }
239 
240         @Nullable
241         private static String parseMatrixValue(@Nullable String string) {
242 
243             if (string != null) {
244 
245                 // Skip an optional prefix (e.g., pb:)
246                 final int index = string.indexOf(':');
247                 if (index > 0) {
248                     string = string.substring(index + 1);
249                 }
250 
251                 // Return the value only if not NULL
252                 if (!"NULL".equalsIgnoreCase(string)) {
253                     return string;
254                 }
255             }
256             return null;
257         }
258 
259     }
260 
261     private static class Parser extends StaxParser {
262 
263         private final Matrix matrix;
264 
265         Parser(final Reader reader, @Nullable final Matrix matrix) throws IOException {
266             super(reader);
267             this.matrix = matrix;
268         }
269 
270         void parse(final Writer writer) throws IOException, XMLStreamException {
271             enter("frameset");
272             while (tryEnter("predicate")) {
273 
274                 // Extract the lemma (may be different from the one in the ID
275                 final String lemma = attribute("lemma").trim().replace('_', ' ').toLowerCase();
276 
277                 // Process rolesets for the current predicate lemma
278                 while (tryEnter("roleset")) {
279 
280                     // Extract eu.fbk.dkm.pikes.resources.PropBank sense and associated description
281                     final String id = attribute("id").trim();
282                     final String name = attribute("name").trim();
283 
284                     // Retrieve frame data from the predicate matrix
285                     final String vnFrames = Joiner.on('|').join(
286                             Ordering.natural().sortedCopy(this.matrix.vnFrames.get(id)));
287                     final String fnFrames = Joiner.on('|').join(
288                             Ordering.natural().sortedCopy(this.matrix.fnFrames.get(id)));
289                     final String eventTypes = Joiner.on('|').join(
290                             Ordering.natural().sortedCopy(this.matrix.eventTypes.get(id)));
291 
292                     // Emit frame data
293                     writer.write(id);
294                     writer.write('\t');
295                     writer.write(lemma);
296                     writer.write('\t');
297                     writer.write(name);
298                     writer.write('\t');
299                     writer.write(vnFrames);
300                     writer.write('\t');
301                     writer.write(fnFrames);
302                     writer.write('\t');
303                     writer.write(eventTypes);
304 
305                     // Process eu.fbk.dkm.pikes.resources.PropBank roles for current roleset
306                     if (tryEnter("roles")) {
307                         while (tryEnter("role")) {
308                             try {
309 
310                                 // Extract role number and associated description
311                                 final int n = Integer.parseInt(attribute("n"));
312                                 final String descr = attribute("descr").trim();
313 
314                                 // Retrieve role data from the predicate matrix
315                                 final String roleId = id + n;
316                                 final String vnRoles = Joiner.on('|').join(
317                                         Ordering.natural().sortedCopy(
318                                                 this.matrix.vnRoles.get(roleId)));
319                                 final String fnRoles = Joiner.on('|').join(
320                                         Ordering.natural().sortedCopy(
321                                                 this.matrix.fnRoles.get(roleId)));
322 
323                                 // Emit role data
324                                 writer.write('\t');
325                                 writer.write(Integer.toString(n));
326                                 writer.write('\t');
327                                 writer.write(Strings.nullToEmpty(descr));
328                                 writer.write('\t');
329                                 writer.write(vnRoles);
330                                 writer.write('\t');
331                                 writer.write(fnRoles);
332 
333                             } catch (final NumberFormatException ex) {
334                                 // ignore
335                             }
336                             leave();
337                         }
338                         leave();
339                     }
340 
341                     // End and flush the line
342                     writer.write('\n');
343                     writer.flush();
344                     leave();
345                 }
346                 leave();
347             }
348             leave();
349         }
350 
351     }
352 
353     public static final class Roleset {
354 
355         private static final Interner<Object> INTERNER = Interners.newStrongInterner();
356 
357         private final String id;
358 
359         private final String lemma;
360 
361         private final String descr;
362 
363         private final List<String> vnFrames;
364 
365         private final List<String> fnFrames;
366 
367         private final List<String> eventTypes;
368 
369         private final String[] argDescr;
370 
371         private final List<String>[] argVNRoles;
372 
373         private final List<String>[] argFNRoles;
374 
375         private final int coreferenceEntityArg;
376 
377         private final int coreferencePredicateArg;
378 
379         @Nullable
380         private List<Integer> argNums;
381 
382         Roleset(final String id, final String lemma, final String descr,
383                 final Iterable<String> argDescr) {
384             this(id, lemma, descr, null, null, null, argDescr, null, null, -1, -1);
385         }
386 
387         Roleset(final String id, final String lemma, final String descr,
388                 final Iterable<String> vnFrames, final Iterable<String> fnFrames,
389                 final Iterable<String> eventTypes, final Iterable<String> argDescr,
390                 final Iterable<? extends Iterable<String>> argVNRoles,
391                 final Iterable<? extends Iterable<String>> argFNRoles,
392                 final int coreferenceEntityArg, final int coreferencePredicateArg) {
393 
394             this.id = id;
395             this.lemma = (String) INTERNER.intern(lemma);
396             this.descr = descr;
397             this.vnFrames = internList(vnFrames);
398             this.fnFrames = internList(fnFrames);
399             this.eventTypes = internList(eventTypes);
400             this.argDescr = Iterables.toArray(argDescr, String.class);
401             this.argVNRoles = internListArray(argVNRoles);
402             this.argFNRoles = internListArray(argFNRoles);
403             this.argNums = null;
404             this.coreferenceEntityArg = coreferenceEntityArg;
405             this.coreferencePredicateArg = coreferencePredicateArg;
406         }
407 
408         public String getID() {
409             return this.id;
410         }
411 
412         public String getLemma() {
413             return this.lemma;
414         }
415 
416         public String getDescr() {
417             return this.descr;
418         }
419 
420         public List<String> getVNFrames() {
421             return this.vnFrames;
422         }
423 
424         public List<String> getFNFrames() {
425             return this.fnFrames;
426         }
427 
428         public List<String> getEventTypes() {
429             return this.eventTypes;
430         }
431 
432         @SuppressWarnings("unchecked")
433         public List<Integer> getArgNums() {
434             if (this.argNums == null) {
435                 final ImmutableList.Builder<Integer> builder = ImmutableList.builder();
436                 for (int i = 0; i < this.argDescr.length; ++i) {
437                     if (!Strings.isNullOrEmpty(this.argDescr[i])) {
438                         builder.add(i);
439                     }
440                 }
441                 this.argNums = (List<Integer>) INTERNER.intern(builder.build());
442             }
443             return this.argNums;
444         }
445 
446         public String getArgDescr(final int argNum) {
447             return this.argDescr[argNum];
448         }
449 
450         public List<String> getArgVNRoles(final int argNum) {
451             return argNum < this.argVNRoles.length ? this.argVNRoles[argNum] : ImmutableList
452                     .<String>of();
453         }
454 
455         public List<String> getArgFNRoles(final int argNum) {
456             return argNum < this.argFNRoles.length ? this.argFNRoles[argNum] : ImmutableList
457                     .<String>of();
458         }
459 
460         public int getCoreferenceEntityArg() {
461             return this.coreferenceEntityArg;
462         }
463 
464         public int getCoreferencePredicateArg() {
465             return this.coreferencePredicateArg;
466         }
467 
468         @Override
469         public String toString() {
470             return this.id;
471         }
472 
473         @SuppressWarnings("unchecked")
474         private static List<String> internList(@Nullable final Iterable<String> strings) {
475             List<String> list = Lists.newArrayList();
476             if (strings != null) {
477                 for (final String string : strings) {
478                     if (string != null) {
479                         list.add((String) INTERNER.intern(string));
480                     }
481                 }
482             }
483             Collections.sort(list);
484             list = ImmutableList.copyOf(list);
485             return (List<String>) INTERNER.intern(list);
486         }
487 
488         @SuppressWarnings({ "unchecked" })
489         private static List<String>[] internListArray(
490                 @Nullable final Iterable<? extends Iterable<String>> stringLists) {
491             final List<List<String>> list = Lists.newArrayList();
492             if (stringLists != null) {
493                 for (final Iterable<String> stringList : stringLists) {
494                     list.add(internList(stringList));
495                 }
496             }
497             return list.toArray(new List[list.size()]);
498         }
499 
500     }
501 
502 }