1 package eu.fbk.dkm.pikes.resources;
2
import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.*;
import com.google.common.io.Resources;
import eu.fbk.utils.core.CommandLine;
import eu.fbk.utils.core.StaxParser;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import javax.xml.stream.XMLStreamException;
import java.io.*;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
20
21 public final class PropBank {
22
/** Every roleset loaded by the static initializer, as one list. */
private static final List<Roleset> ROLESETS;

/** Rolesets keyed by the identifier read from the first column of the bundled TSV. */
private static final Map<String, Roleset> ID_INDEX;

/** Rolesets grouped by the lemma read from the second column of the bundled TSV. */
private static final ListMultimap<String, Roleset> LEMMA_INDEX;
28
29 static {
30 try {
31 final Map<String, int[]> corefMap = Maps.newHashMap();
32 for (final String line : Resources.readLines(
33 PropBank.class.getResource("PropBank.coref"), Charsets.UTF_8)) {
34 final String[] tokens = line.split("\\s+");
35 final int[] roles = new int[] { Integer.parseInt(tokens[1]),
36 Integer.parseInt(tokens[2]) };
37 corefMap.put(tokens[0], roles);
38 }
39
40 final Map<String, Roleset> idIndex = Maps.newLinkedHashMap();
41 final ListMultimap<String, Roleset> lemmaIndex = ArrayListMultimap.create();
42
43 final BufferedReader reader = Resources.asCharSource(
44 PropBank.class.getResource("PropBank.tsv"), Charsets.UTF_8)
45 .openBufferedStream();
46
47 String line;
48 while ((line = reader.readLine()) != null) {
49
50
51 final String[] tokens = Iterables.toArray(Splitter.on('\t').split(line),
52 String.class);
53 final String id = tokens[0];
54 final String lemma = tokens[1];
55 final String name = tokens[2];
56 final List<String> vnFrames = Splitter.on('|').splitToList(tokens[3]);
57 final List<String> fnFrames = Splitter.on('|').splitToList(tokens[4]);
58 final List<String> eventTypes = Splitter.on('|').splitToList(tokens[5]);
59
60
61 final List<String> argDescr = Lists.newArrayList();
62 final List<List<String>> argVNRoles = Lists.newArrayList();
63 final List<List<String>> argFNRoles = Lists.newArrayList();
64 for (int i = 0; i < 6; ++i) {
65 argDescr.add(null);
66 argVNRoles.add(null);
67 argFNRoles.add(null);
68 }
69 for (int i = 6; i + 3 < tokens.length; i += 4) {
70 final int num = Integer.parseInt(tokens[i]);
71 argDescr.set(num, tokens[i + 1]);
72 argVNRoles.set(num, Splitter.on('|').splitToList(tokens[i + 2]));
73 argFNRoles.set(num, Splitter.on('|').splitToList(tokens[i + 3]));
74 }
75
76
77 final int[] corefRoles = corefMap.get(id);
78 final int entityRole = corefRoles == null ? -1 : corefRoles[0];
79 final int predicateRole = corefRoles == null ? -1 : corefRoles[1];
80 final Roleset roleset = new Roleset(id, lemma, name, vnFrames, fnFrames,
81 eventTypes, argDescr, argVNRoles, argFNRoles, entityRole, predicateRole);
82 idIndex.put(id, roleset);
83 lemmaIndex.put(lemma, roleset);
84 }
85
86 reader.close();
87
88 ROLESETS = ImmutableList.copyOf(idIndex.values());
89 ID_INDEX = ImmutableMap.copyOf(idIndex);
90 LEMMA_INDEX = ImmutableListMultimap.copyOf(lemmaIndex);
91
92 } catch (final IOException ex) {
93 throw new Error("Cannot load eu.fbk.dkm.pikes.resources.PropBank data", ex);
94 }
95 }
96
97 @Nullable
98 public static Roleset getRoleset(@Nullable final String id) {
99 return ID_INDEX.get(id == null ? null : id.toLowerCase());
100 }
101
102 public static List<Roleset> getRolesets(@Nullable final String lemma) {
103 return LEMMA_INDEX.get(lemma == null ? null : lemma.toLowerCase());
104 }
105
/**
 * Returns every roleset loaded by this class.
 *
 * @return the shared list held in {@code ROLESETS}
 */
public static List<Roleset> getRolesets() {
    return ROLESETS;
}
109
110 public static void main(final String[] args) throws IOException, XMLStreamException {
111
112 try {
113 final CommandLine cmd = CommandLine
114 .parser()
115 .withName("PropBankBank")
116 .withHeader(
117 "Generate a TSV file with indexed eu.fbk.dkm.pikes.resources.PropBank data, "
118 + "including mapping to eu.fbk.dkm.pikes.resources.VerbNet and eu.fbk.dkm.pikes.resources.FrameNet from the PredicateMatrix")
119 .withOption("f", "frames", "the directory containing frame definitions",
120 "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
121 .withOption("m", "matrix", "the file containing the predicate matrix", "FILE",
122 CommandLine.Type.FILE_EXISTING, true, false, true)
123 .withOption("o", "output", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
124 .withLogger(LoggerFactory.getLogger("eu.fbk.nafview")).parse(args);
125
126 final File dir = cmd.getOptionValue("f", File.class);
127 final File pm = cmd.getOptionValue("m", File.class);
128 final File output = cmd.getOptionValue("o", File.class);
129
130
131 final Matrix matrix = new Matrix(pm);
132
133 final Writer writer = new OutputStreamWriter(new BufferedOutputStream(
134 new FileOutputStream(output)), Charsets.UTF_8);
135
136 final File[] files = dir.listFiles();
137 Arrays.sort(files);
138
139 for (final File file : files) {
140 if (file.getName().endsWith(".xml")) {
141 System.out.println("Processing " + file);
142 final Reader reader = new BufferedReader(new FileReader(file));
143 try {
144 new Parser(reader, matrix).parse(writer);
145 } finally {
146 reader.close();
147 }
148 }
149 }
150
151 } catch (final Throwable ex) {
152 CommandLine.fail(ex);
153 }
154 }
155
156 private static class Matrix {
157
158 final Multimap<String, String> vnFrames;
159
160 final Multimap<String, String> fnFrames;
161
162 final Multimap<String, String> eventTypes;
163
164 final Multimap<String, String> vnRoles;
165
166 final Multimap<String, String> fnRoles;
167
168 Matrix(final File file) throws IOException {
169
170 this.vnFrames = HashMultimap.create();
171 this.fnFrames = HashMultimap.create();
172 this.eventTypes = HashMultimap.create();
173 this.vnRoles = HashMultimap.create();
174 this.fnRoles = HashMultimap.create();
175
176 parseMatrix(file);
177 }
178
179 private void parseMatrix(final File matrixFile) throws IOException {
180
181 final BufferedReader in = new BufferedReader(new InputStreamReader(
182 new FileInputStream(matrixFile), Charsets.UTF_8));
183
184 try {
185
186 String line;
187 while ((line = in.readLine()) != null) {
188
189
190 final String[] tokens = line.split("\t");
191 if (tokens.length <= 18) {
192 continue;
193 }
194
195
196 final String pbFrame = parseMatrixValue(tokens[11]);
197 if (pbFrame == null) {
198 continue;
199 }
200 final String pbRole = parseMatrixValue(tokens[12]);
201 final String pbFrameRole = pbFrame + pbRole;
202
203
204 final String vnClass = parseMatrixValue(tokens[0]);
205 final String vnSubClass = parseMatrixValue(tokens[2]);
206 final String vnFrame = vnSubClass != null ? vnSubClass : vnClass;
207 final String vnRole = parseMatrixValue(tokens[5]);
208 if (vnSubClass != null && vnClass != null && !vnSubClass.startsWith(vnClass)) {
209 System.err.println("Unexpected VN class / subclass pair: " + vnClass
210 + ", " + vnSubClass);
211 }
212 if (vnFrame != null) {
213 this.vnFrames.put(pbFrame, vnFrame);
214 if (vnRole != null) {
215 this.vnRoles.put(pbFrameRole, vnRole);
216 }
217 }
218
219
220 final String fnFrame = parseMatrixValue(tokens[8]);
221 final String fnRole = parseMatrixValue(tokens[10]);
222 if (fnFrame != null) {
223 this.fnFrames.put(pbFrame, fnFrame);
224 if (fnRole != null) {
225 this.fnRoles.put(pbFrameRole, fnRole);
226 }
227 }
228
229
230 final String eventType = parseMatrixValue(tokens[17]);
231 if (eventType != null) {
232 this.eventTypes.put(pbFrame, eventType);
233 }
234 }
235 } finally {
236 in.close();
237 }
238 }
239
240 @Nullable
241 private static String parseMatrixValue(@Nullable String string) {
242
243 if (string != null) {
244
245
246 final int index = string.indexOf(':');
247 if (index > 0) {
248 string = string.substring(index + 1);
249 }
250
251
252 if (!"NULL".equalsIgnoreCase(string)) {
253 return string;
254 }
255 }
256 return null;
257 }
258
259 }
260
261 private static class Parser extends StaxParser {
262
263 private final Matrix matrix;
264
265 Parser(final Reader reader, @Nullable final Matrix matrix) throws IOException {
266 super(reader);
267 this.matrix = matrix;
268 }
269
270 void parse(final Writer writer) throws IOException, XMLStreamException {
271 enter("frameset");
272 while (tryEnter("predicate")) {
273
274
275 final String lemma = attribute("lemma").trim().replace('_', ' ').toLowerCase();
276
277
278 while (tryEnter("roleset")) {
279
280
281 final String id = attribute("id").trim();
282 final String name = attribute("name").trim();
283
284
285 final String vnFrames = Joiner.on('|').join(
286 Ordering.natural().sortedCopy(this.matrix.vnFrames.get(id)));
287 final String fnFrames = Joiner.on('|').join(
288 Ordering.natural().sortedCopy(this.matrix.fnFrames.get(id)));
289 final String eventTypes = Joiner.on('|').join(
290 Ordering.natural().sortedCopy(this.matrix.eventTypes.get(id)));
291
292
293 writer.write(id);
294 writer.write('\t');
295 writer.write(lemma);
296 writer.write('\t');
297 writer.write(name);
298 writer.write('\t');
299 writer.write(vnFrames);
300 writer.write('\t');
301 writer.write(fnFrames);
302 writer.write('\t');
303 writer.write(eventTypes);
304
305
306 if (tryEnter("roles")) {
307 while (tryEnter("role")) {
308 try {
309
310
311 final int n = Integer.parseInt(attribute("n"));
312 final String descr = attribute("descr").trim();
313
314
315 final String roleId = id + n;
316 final String vnRoles = Joiner.on('|').join(
317 Ordering.natural().sortedCopy(
318 this.matrix.vnRoles.get(roleId)));
319 final String fnRoles = Joiner.on('|').join(
320 Ordering.natural().sortedCopy(
321 this.matrix.fnRoles.get(roleId)));
322
323
324 writer.write('\t');
325 writer.write(Integer.toString(n));
326 writer.write('\t');
327 writer.write(Strings.nullToEmpty(descr));
328 writer.write('\t');
329 writer.write(vnRoles);
330 writer.write('\t');
331 writer.write(fnRoles);
332
333 } catch (final NumberFormatException ex) {
334
335 }
336 leave();
337 }
338 leave();
339 }
340
341
342 writer.write('\n');
343 writer.flush();
344 leave();
345 }
346 leave();
347 }
348 leave();
349 }
350
351 }
352
353 public static final class Roleset {
354
355 private static final Interner<Object> INTERNER = Interners.newStrongInterner();
356
357 private final String id;
358
359 private final String lemma;
360
361 private final String descr;
362
363 private final List<String> vnFrames;
364
365 private final List<String> fnFrames;
366
367 private final List<String> eventTypes;
368
369 private final String[] argDescr;
370
371 private final List<String>[] argVNRoles;
372
373 private final List<String>[] argFNRoles;
374
375 private final int coreferenceEntityArg;
376
377 private final int coreferencePredicateArg;
378
379 @Nullable
380 private List<Integer> argNums;
381
382 Roleset(final String id, final String lemma, final String descr,
383 final Iterable<String> argDescr) {
384 this(id, lemma, descr, null, null, null, argDescr, null, null, -1, -1);
385 }
386
387 Roleset(final String id, final String lemma, final String descr,
388 final Iterable<String> vnFrames, final Iterable<String> fnFrames,
389 final Iterable<String> eventTypes, final Iterable<String> argDescr,
390 final Iterable<? extends Iterable<String>> argVNRoles,
391 final Iterable<? extends Iterable<String>> argFNRoles,
392 final int coreferenceEntityArg, final int coreferencePredicateArg) {
393
394 this.id = id;
395 this.lemma = (String) INTERNER.intern(lemma);
396 this.descr = descr;
397 this.vnFrames = internList(vnFrames);
398 this.fnFrames = internList(fnFrames);
399 this.eventTypes = internList(eventTypes);
400 this.argDescr = Iterables.toArray(argDescr, String.class);
401 this.argVNRoles = internListArray(argVNRoles);
402 this.argFNRoles = internListArray(argFNRoles);
403 this.argNums = null;
404 this.coreferenceEntityArg = coreferenceEntityArg;
405 this.coreferencePredicateArg = coreferencePredicateArg;
406 }
407
408 public String getID() {
409 return this.id;
410 }
411
412 public String getLemma() {
413 return this.lemma;
414 }
415
416 public String getDescr() {
417 return this.descr;
418 }
419
420 public List<String> getVNFrames() {
421 return this.vnFrames;
422 }
423
424 public List<String> getFNFrames() {
425 return this.fnFrames;
426 }
427
428 public List<String> getEventTypes() {
429 return this.eventTypes;
430 }
431
432 @SuppressWarnings("unchecked")
433 public List<Integer> getArgNums() {
434 if (this.argNums == null) {
435 final ImmutableList.Builder<Integer> builder = ImmutableList.builder();
436 for (int i = 0; i < this.argDescr.length; ++i) {
437 if (!Strings.isNullOrEmpty(this.argDescr[i])) {
438 builder.add(i);
439 }
440 }
441 this.argNums = (List<Integer>) INTERNER.intern(builder.build());
442 }
443 return this.argNums;
444 }
445
446 public String getArgDescr(final int argNum) {
447 return this.argDescr[argNum];
448 }
449
450 public List<String> getArgVNRoles(final int argNum) {
451 return argNum < this.argVNRoles.length ? this.argVNRoles[argNum] : ImmutableList
452 .<String>of();
453 }
454
455 public List<String> getArgFNRoles(final int argNum) {
456 return argNum < this.argFNRoles.length ? this.argFNRoles[argNum] : ImmutableList
457 .<String>of();
458 }
459
460 public int getCoreferenceEntityArg() {
461 return this.coreferenceEntityArg;
462 }
463
464 public int getCoreferencePredicateArg() {
465 return this.coreferencePredicateArg;
466 }
467
468 @Override
469 public String toString() {
470 return this.id;
471 }
472
473 @SuppressWarnings("unchecked")
474 private static List<String> internList(@Nullable final Iterable<String> strings) {
475 List<String> list = Lists.newArrayList();
476 if (strings != null) {
477 for (final String string : strings) {
478 if (string != null) {
479 list.add((String) INTERNER.intern(string));
480 }
481 }
482 }
483 Collections.sort(list);
484 list = ImmutableList.copyOf(list);
485 return (List<String>) INTERNER.intern(list);
486 }
487
488 @SuppressWarnings({ "unchecked" })
489 private static List<String>[] internListArray(
490 @Nullable final Iterable<? extends Iterable<String>> stringLists) {
491 final List<List<String>> list = Lists.newArrayList();
492 if (stringLists != null) {
493 for (final Iterable<String> stringList : stringLists) {
494 list.add(internList(stringList));
495 }
496 }
497 return list.toArray(new List[list.size()]);
498 }
499
500 }
501
502 }