1 package eu.fbk.dkm.pikes.tintop.util.framenet;
2
3 import ch.qos.logback.classic.Level;
4 import com.google.common.base.Charsets;
5 import com.google.common.collect.HashMultimap;
6 import com.google.common.io.Files;
7 import eu.fbk.dkm.pikes.resources.FrameBase;
8 import eu.fbk.dkm.pikes.resources.WordNet;
9 import eu.fbk.fcw.utils.corpus.Corpus;
10 import eu.fbk.fcw.utils.corpus.Sentence;
11 import eu.fbk.fcw.utils.corpus.Srl;
12 import eu.fbk.fcw.utils.corpus.Word;
13 import eu.fbk.dkm.pikes.resources.util.fnlu.*;
14 import eu.fbk.dkm.pikes.resources.util.onsenses.Inventory;
15 import eu.fbk.dkm.pikes.resources.util.onsenses.Sense;
16 import eu.fbk.dkm.pikes.resources.util.onsenses.Wn;
17 import eu.fbk.dkm.pikes.resources.util.propbank.*;
18 import eu.fbk.dkm.pikes.resources.util.semlink.vnfn.SemLinkRoot;
19 import eu.fbk.dkm.pikes.resources.util.semlink.vnfnroles.Role;
20 import eu.fbk.dkm.pikes.resources.util.semlink.vnfnroles.SemLinkRolesRoot;
21 import eu.fbk.dkm.pikes.resources.util.semlink.vnfnroles.Vncls;
22 import eu.fbk.dkm.pikes.resources.util.semlink.vnpb.Argmap;
23 import eu.fbk.dkm.pikes.resources.util.semlink.vnpb.PbvnTypemap;
24 import eu.fbk.utils.core.CommandLine;
25 import eu.fbk.utils.core.FrequencyHashSet;
26 import net.didion.jwnl.data.PointerType;
27 import org.eclipse.rdf4j.model.IRI;
28 import org.slf4j.Logger;
29 import org.slf4j.LoggerFactory;
30
31 import javax.xml.bind.JAXBContext;
32 import javax.xml.bind.JAXBException;
33 import javax.xml.bind.Unmarshaller;
34 import java.io.*;
35 import java.util.*;
36 import java.util.regex.Matcher;
37 import java.util.regex.Pattern;
38
39
40
41
42
43
44
45
46
47
48
49 public class MergeMateFramenet {
50
51
52
53 private static final Logger LOGGER = LoggerFactory.getLogger(MergeMateFramenet.class);
54 private static HashMap<String, String> lemmaToTransform = new HashMap();
55 static final Pattern ONTONOTES_FILENAME_PATTERN = Pattern.compile("(.*)-([a-z]+)\\.xml");
56 static final Pattern FRAMEBASE_PATTERN = Pattern
57 .compile("^[^\\s]+\\s+[^\\s]+\\s+([^\\s]+)\\s+-\\s+(.+)\\s+-\\s+([a-z])#([0-9]+)$");
58 static final Pattern PB_PATTERN = Pattern.compile("^verb-((.*)\\.[0-9]+)$");
59
60
61
62 static final Pattern FB_PREDICATES = Pattern.compile("^frame-(Chemical-sense_description|Non-commutative_process|Non-commutative_statement|[^-]*)-(.*)\\.([a-z]+)$");
63
64 static final Pattern FB_ROLES = Pattern.compile("^fe-(.*)-(.*)$");
65
/**
 * Kinds of mapping produced by the merger; each constant keys one of the
 * output mapping tables.
 */
public enum OutputMapping {
    /** PropBank roleset mapped automatically to a frame. */
    PBauto,
    /** NomBank roleset mapped automatically to a frame. */
    NBauto,
    /** NomBank roleset mapped directly from the lexical resources. */
    NBresource,
    /** PropBank roleset whose single selected frame is the only frame already listed for it. */
    PBtrivial
}
69
70 static {
71 lemmaToTransform.put("cry+down(e)", "cry+down");
72 }
73
74
75
76
77
78
79
80
81 protected static String getLemmaFromPredicateName(String lemmaFromPredicate) {
82 String lemma = lemmaFromPredicate.replace('_', '+')
83 .replace(' ', '+');
84 if (lemmaToTransform.keySet().contains(lemma)) {
85 lemma = lemmaToTransform.get(lemma);
86 }
87 return lemma;
88 }
89
90
91
92
93
94
95
96 private static Collection<String> getIntersection(Collection<String>... collections) {
97 return getIntersection(true, collections);
98 }
99
100
101
102
103
104
105
106
107 private static Collection<String> getIntersection(boolean ignoreEmptySets, Collection<String>... collections) {
108 Collection<String> ret = null;
109 for (Collection<String> collection : collections) {
110 if (ignoreEmptySets && (collection == null || collection.size() == 0)) {
111 continue;
112 }
113 if (ret == null) {
114 ret = new HashSet<>();
115 ret.addAll(collection);
116 } else {
117 ret.retainAll(collection);
118 }
119 }
120
121 if (ret == null) {
122 ret = new HashSet<>();
123 }
124 return ret;
125 }
126
127 private static ArrayList<Matcher> getPropBankPredicates(Roleset roleset) {
128
129 ArrayList<Matcher> ret = new ArrayList<>();
130
131 String source = roleset.getSource();
132 if (source != null && source.length() > 0) {
133
134 String[] parts = source.split("\\s+");
135 for (String part : parts) {
136 if (part.trim().length() == 0) {
137 continue;
138 }
139
140 Matcher matcher = PB_PATTERN.matcher(source);
141 if (!matcher.find()) {
142 continue;
143 }
144
145 ret.add(matcher);
146 }
147 }
148
149 return ret;
150 }
151
152 public static void main(String[] args) {
153 try {
154 final CommandLine cmd = CommandLine
155 .parser()
156 .withName("./merger")
157 .withHeader("Transform linguistic resources into RDF")
158 .withOption("p", "propbank", "PropBank folder", "FOLDER",
159 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
160 .withOption("w", "wordnet", "WordNet folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
161 false, true)
162 .withOption("o", "ontonotes", "Ontonotes senses folder", "FOLDER",
163 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
164 .withOption("l", "lu", "FrameNet LU folder", "FOLDER",
165 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
166 .withOption(null, "lu-parsed", "FrameNet LU folder (parsed, in CoNLL format)", "FOLDER",
167 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
168 .withOption("f", "framebase", "FrameBase FrameNet-WordNet map", "FILE",
169 CommandLine.Type.FILE_EXISTING, true, false, true)
170 .withOption("s", "semlink", "SemLink folder", "FOLDER",
171 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
172 .withOption("n", "nombank", "NomBank folder", "FOLDER",
173 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
174 .withOption("c", "close-match", "closeMatch file from PreMOn", "FILE",
175 CommandLine.Type.FILE_EXISTING, true, false, true)
176 .withOption(null, "ignore-lemma", "ignore lemma information")
177 .withOption(null, "save-files", "serialize big files")
178 .withOption(null, "print-pr-table", "print precision/recall table")
179 .withOption("O", "output", "Output file prefix", "PREFIX",
180 CommandLine.Type.STRING, true, false, true)
181 .withOption(null, "enable-sl4p",
182 "Enable extraction of frames using SemLink when framnet argument of roleset is empty")
183 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
184
185 ((ch.qos.logback.classic.Logger) LOGGER).setLevel(Level.INFO);
186
187 File pbFolder = cmd.getOptionValue("propbank", File.class);
188 File nbFolder = cmd.getOptionValue("nombank", File.class);
189 File wordnetFolder = cmd.getOptionValue("wordnet", File.class);
190 File ontonotesFolder = cmd.getOptionValue("ontonotes", File.class);
191 File framebaseFile = cmd.getOptionValue("framebase", File.class);
192 File closeMatchFile = cmd.getOptionValue("close-match", File.class);
193 File luFolder = cmd.getOptionValue("lu", File.class);
194 File luParsedFolder = cmd.getOptionValue("lu-parsed", File.class);
195 File semlinkFolder = cmd.getOptionValue("semlink", File.class);
196
197 String outputPattern = cmd.getOptionValue("output", String.class);
198
199 boolean enableSemLinkForPredicates = cmd.hasOption("enable-sl4p");
200 boolean saveFiles = cmd.hasOption("save-files");
201 boolean printPRTable = cmd.hasOption("print-pr-table");
202
203 boolean ignoreLemmaInFrameBaseMappings = cmd.hasOption("ignore-lemma");
204
205
206
207 Integer max = null;
208
209 WordNet.setPath(wordnetFolder.getAbsolutePath());
210 WordNet.init();
211
212 JAXBContext fnContext = JAXBContext.newInstance(Frameset.class);
213 Unmarshaller fnUnmarshaller = fnContext.createUnmarshaller();
214
215 JAXBContext onContext = JAXBContext.newInstance(Inventory.class);
216 Unmarshaller onUnmarshaller = onContext.createUnmarshaller();
217
218 JAXBContext luContext = JAXBContext.newInstance(LexUnit.class);
219 Unmarshaller luUnmarshaller = luContext.createUnmarshaller();
220
221 JAXBContext semlinkContext = JAXBContext.newInstance(SemLinkRoot.class);
222 Unmarshaller semlinkUnmarshaller = semlinkContext.createUnmarshaller();
223
224 JAXBContext semlinkRolesContext = JAXBContext.newInstance(SemLinkRolesRoot.class);
225 Unmarshaller semlinkRolesUnmarshaller = semlinkRolesContext.createUnmarshaller();
226
227 JAXBContext semlinkPbContext = JAXBContext.newInstance(PbvnTypemap.class);
228 Unmarshaller semlinkPbUnmarshaller = semlinkPbContext.createUnmarshaller();
229
230 BufferedWriter writer, writerFrames, writerRoles;
231 File outputFile, outputFileFrames, outputFileRoles;
232 outputFileFrames = new File(outputPattern + "-frames-ok.tsv");
233 outputFileRoles = new File(outputPattern + "-roles-ok.tsv");
234 writerFrames = new BufferedWriter(new FileWriter(outputFileFrames));
235 writerRoles = new BufferedWriter(new FileWriter(outputFileRoles));
236
237
238
239
240 for (String predicate : FrameBase.getPredicatesSet()) {
241 Matcher matcher = FB_PREDICATES.matcher(predicate.trim());
242 if (!matcher.find()) {
243 LOGGER.error("{} is not correctly formatted", predicate);
244 continue;
245 }
246
247 IRI fbIRI = FrameBase.uriFor(predicate);
248 if (fbIRI == null) {
249 LOGGER.error("This should never happen!");
250 LOGGER.debug(predicate);
251 break;
252 }
253
254 String lemma = matcher.group(2).toLowerCase();
255 lemma = lemma.replaceAll("\\(\\(.*\\)\\)", "");
256 lemma = lemma.replaceAll("\\(", "");
257 lemma = lemma.replaceAll("\\)", "");
258 lemma = lemma.replace('_', ' ');
259 lemma = lemma.trim();
260 lemma = lemma.replace(' ', '_');
261
262 writerFrames.append("fn:");
263 writerFrames.append(matcher.group(1).toLowerCase()).append('\t');
264 writerFrames.append(lemma).append('\t');
265 writerFrames.append(matcher.group(3).toLowerCase()).append('\t');
266 writerFrames.append(fbIRI.toString()).append('\n');
267 }
268 for (String role : FrameBase.getRolesSet()) {
269 Matcher matcher = FB_ROLES.matcher(role.trim());
270 if (!matcher.find()) {
271 LOGGER.error("{} is not correctly formatted", role);
272 continue;
273 }
274
275 IRI fbIRI = FrameBase.uriFor(role);
276 if (fbIRI == null) {
277 LOGGER.error("This should never happen!");
278 LOGGER.debug(role);
279 break;
280 }
281
282 String roleAt = matcher.group(1) + "@" + matcher.group(2);
283 roleAt = roleAt.toLowerCase();
284
285 writerRoles.append("fn:");
286 writerRoles.append(roleAt).append('\t');
287 writerRoles.append(fbIRI.toString()).append('\n');
288 }
289
290
291
292
293
294
295
296
297 LOGGER.info("Loading closeMatches");
298 HashMap<String, HashMap<String, String>> nomBankToProbBankRoles = new HashMap<>();
299 Pattern CLOSEMATCH_PATTERN = Pattern.compile("nb10-(.*?)-arg([0-9])>.*pbon5-(.*?)-arg([0-9])>");
300 List<String> closeMatchLines = Files.readLines(closeMatchFile, Charsets.UTF_8);
301 for (String line : closeMatchLines) {
302 line = line.trim();
303 Matcher matcher = CLOSEMATCH_PATTERN.matcher(line);
304 if (matcher.find()) {
305 if (matcher.group(2).equals(matcher.group(4))) {
306 continue;
307 }
308
309 String nbPredicate = matcher.group(1);
310
311 if (!nomBankToProbBankRoles.containsKey(nbPredicate)) {
312 nomBankToProbBankRoles.put(nbPredicate, new HashMap<>());
313 }
314
315 nomBankToProbBankRoles.get(nbPredicate).put(matcher.group(2), matcher.group(4));
316 }
317 }
318
319
320
321
322 LOGGER.info("Loading SemLink");
323 File semlinkFile;
324
325 semlinkFile = new File(semlinkFolder.getAbsolutePath() + File.separator + "vn-pb" + File.separator
326 + "vnpbMappings");
327 PbvnTypemap semLinkPb = (PbvnTypemap) semlinkPbUnmarshaller.unmarshal(semlinkFile);
328
329 HashMultimap<String, String> verbnetToPropbank = HashMultimap.create();
330 HashMultimap<String, String> propbankToVerbnet = HashMultimap.create();
331
332 for (eu.fbk.dkm.pikes.resources.util.semlink.vnpb.Predicate predicate : semLinkPb.getPredicate()) {
333 String lemma = predicate.getLemma();
334 Argmap argmap = predicate.getArgmap();
335 if (argmap == null) {
336 continue;
337 }
338
339 String pbFrame = argmap.getPbRoleset().toLowerCase();
340 String vnClass = argmap.getVnClass().toLowerCase();
341
342 verbnetToPropbank.put(vnClass, pbFrame);
343 propbankToVerbnet.put(pbFrame, vnClass);
344
345 for (eu.fbk.dkm.pikes.resources.util.semlink.vnpb.Role role : argmap.getRole()) {
346 String pbArg = pbFrame + "@" + role.getPbArg().toLowerCase();
347 String vnTheta = vnClass + "@" + role.getVnTheta().toLowerCase();
348
349 verbnetToPropbank.put(vnTheta, pbArg);
350 propbankToVerbnet.put(pbArg, vnTheta);
351 }
352
353 }
354
355 semlinkFile = new File(semlinkFolder.getAbsolutePath() + File.separator + "vn-fn" + File.separator
356 + "VN-FNRoleMapping.txt");
357 SemLinkRolesRoot semLinkRoles = (SemLinkRolesRoot) semlinkRolesUnmarshaller.unmarshal(semlinkFile);
358
359 HashMultimap<String, String> verbnetToFramenet = HashMultimap.create();
360 HashMultimap<String, String> framenetToVerbnet = HashMultimap.create();
361
362 for (Vncls vncls : semLinkRoles.getVncls()) {
363 String frame = vncls.getFnframe().toLowerCase();
364 String vnClass = vncls.getClazz().toLowerCase();
365
366 verbnetToFramenet.put(vnClass, frame);
367 framenetToVerbnet.put(frame, vnClass);
368
369 if (vncls.getRoles() == null) {
370 continue;
371 }
372
373 for (Role role : vncls.getRoles().getRole()) {
374 String fnRole = frame + "@" + role.getFnrole().toLowerCase();
375 String vnRole = vnClass + "@" + role.getVnrole().toLowerCase();
376
377 verbnetToFramenet.put(vnRole, fnRole);
378 framenetToVerbnet.put(fnRole, vnRole);
379 }
380 }
381
382 semlinkFile = new File(
383 semlinkFolder.getAbsolutePath() + File.separator + "vn-fn" + File.separator + "VNC-FNF.s");
384 SemLinkRoot semLink = (SemLinkRoot) semlinkUnmarshaller.unmarshal(semlinkFile);
385
386 for (eu.fbk.dkm.pikes.resources.util.semlink.vnfn.Vncls vncls : semLink.getVncls()) {
387 String vnClass = vncls.getClazz().toLowerCase();
388 String frame = vncls.getFnframe().toLowerCase();
389
390 verbnetToFramenet.put(vnClass, frame);
391 framenetToVerbnet.put(frame, vnClass);
392 }
393
394
395
396
397 int nbSource = 0;
398
399 LOGGER.info("Loading NomBank files");
400 HashMultimap<String, Roleset> nbFrames = HashMultimap.create();
401 HashSet<Roleset> nbUnlinked = new HashSet<>();
402 for (File file : Files.fileTreeTraverser().preOrderTraversal(nbFolder)) {
403
404 if (!file.isFile()) {
405 continue;
406 }
407
408 if (!file.getName().endsWith(".xml")) {
409 continue;
410 }
411
412 LOGGER.debug(file.getName());
413
414 Frameset frameset = (Frameset) fnUnmarshaller.unmarshal(file);
415 List<Object> noteOrPredicate = frameset.getNoteOrPredicate();
416 for (Object predicate : noteOrPredicate) {
417 if (predicate instanceof Predicate) {
418 String lemma = ((Predicate) predicate).getLemma();
419 List<Object> noteOrRoleset = ((Predicate) predicate).getNoteOrRoleset();
420 for (Object roleset : noteOrRoleset) {
421 if (roleset instanceof Roleset) {
422
423
424 ((Roleset) roleset).setName(lemma);
425
426 ArrayList<Matcher> predicates = getPropBankPredicates((Roleset) roleset);
427 for (Matcher matcher : predicates) {
428 String pb = matcher.group(1);
429 nbFrames.put(pb, (Roleset) roleset);
430 nbSource++;
431 }
432
433 if (predicates.size() == 0) {
434 nbUnlinked.add((Roleset) roleset);
435 }
436 }
437 }
438 }
439 }
440 }
441
442 LOGGER.info("Loaded {} rolesets with source", nbSource);
443 LOGGER.info("Loaded {} frames without source", nbUnlinked.size());
444
445
446
447
448 LOGGER.info("Loading LU files");
449 int i = 0;
450 HashMap<String, HashMultimap<String, String>> lus = new HashMap<>();
451 HashSet<String> existingFrames = new HashSet<>();
452 List<Sentence> exampleSentences = new ArrayList<>();
453
454 File existingFramesFile = new File(outputPattern + "-lu-existingFrames.ser");
455 File lusFile = new File(outputPattern + "-lu-lus.ser");
456 File exampleSentencesFile = new File(outputPattern + "-lu-exampleSentences.ser");
457 if (existingFramesFile.exists() && lusFile.exists() && exampleSentencesFile.exists()) {
458 LOGGER.info("Loading data from files");
459 existingFrames = (HashSet<String>) loadObjectFromFile(existingFramesFile);
460 lus = (HashMap<String, HashMultimap<String, String>>) loadObjectFromFile(lusFile);
461 exampleSentences = (List<Sentence>) loadObjectFromFile(exampleSentencesFile);
462 } else {
463 for (File file : Files.fileTreeTraverser().preOrderTraversal(luFolder)) {
464 if (!file.isFile()) {
465 continue;
466 }
467
468 if (!file.getName().endsWith(".xml")) {
469 continue;
470 }
471
472 LOGGER.debug(file.getName());
473 i++;
474 if (max != null && i > max) {
475 break;
476 }
477
478 LexUnit lexUnit = (LexUnit) luUnmarshaller.unmarshal(file);
479 String lemma = "";
480 POSType posType = lexUnit.getPOS();
481 for (LexemeType lexeme : lexUnit.getLexeme()) {
482 lemma = lemma + " " + lexeme.getName();
483 }
484 lemma = lemma.trim();
485
486 if (lemma.length() == 0 || posType == null) {
487 LOGGER.error("Lemma or POS null ({}/{})", lemma, posType);
488 continue;
489 }
490 String pos = posType.toString().toLowerCase();
491 String frame = lexUnit.getFrame().toLowerCase();
492
493
494 Corpus corpus = null;
495 File parsedFile = new File(luParsedFolder + File.separator + file.getName() + ".conll");
496 if (parsedFile.exists()) {
497 corpus = Corpus.readDocumentFromFile(parsedFile.getAbsolutePath(), "mate");
498 }
499
500
501 int exampleNo = 0;
502 if (corpus != null) {
503 for (SubCorpusType subCorpus : lexUnit.getSubCorpus()) {
504 for (SentenceType sentence : subCorpus.getSentence()) {
505 String text = sentence.getText();
506 if (text != null && text.length() > 0) {
507
508 Sentence conllSentence = corpus.getSentences().get(exampleNo++);
509
510
511 List<Integer> target = new ArrayList<>();
512 HashMultimap<String, List<Integer>> roles = HashMultimap.create();
513
514 for (AnnotationSetType annotationSet : sentence.getAnnotationSet()) {
515 for (LayerType layer : annotationSet.getLayer()) {
516 String name = layer.getName();
517 if (name.equals("Target")) {
518 for (LabelType label : layer.getLabel()) {
519 target = getSpan(text, label);
520
521
522 break;
523 }
524 }
525 if (name.equals("FE")) {
526 for (LabelType label : layer.getLabel()) {
527 List<Integer> span = getSpan(text, label);
528 if (span == null) {
529 continue;
530 }
531 roles.put(label.getName(), span);
532 }
533
534 }
535 }
536 }
537
538 if (target == null || target.size() == 0) {
539 LOGGER.error("Target not found");
540 continue;
541 }
542
543 try {
544 Integer targetHead = conllSentence.searchHead(target);
545 Srl srl = new Srl(conllSentence.getWords().get(targetHead), frame, "framenet");
546 for (String roleLabel : roles.keySet()) {
547 Set<List<Integer>> spans = roles.get(roleLabel);
548 for (List<Integer> span : spans) {
549 Integer roleHead = conllSentence.searchHead(span);
550 eu.fbk.fcw.utils.corpus.Role role = new eu.fbk.fcw.utils.corpus.Role(
551 conllSentence.getWords().get(roleHead), roleLabel);
552 srl.addRole(role);
553 }
554 }
555 conllSentence.addSrl(srl);
556 } catch (Exception e) {
557
558 LOGGER.error("Error in aligning tokens");
559
560
561
562
563
564
565
566
567
568 }
569
570 exampleSentences.add(conllSentence);
571 }
572 }
573 }
574 }
575
576 existingFrames.add(frame);
577
578
579
580
581
582
583
584
585
586
587 if (lus.get(pos) == null) {
588 lus.put(pos, HashMultimap.create());
589 }
590
591 lus.get(pos).put(lemma, frame);
592 }
593 }
594
595
596
597
598 LOGGER.info("Load FrameBase file");
599 HashMultimap<String, String> fbFramenetToWordNet = HashMultimap.create();
600
601 List<String> lines = Files.readLines(framebaseFile, Charsets.UTF_8);
602 for (String line : lines) {
603 line = line.trim();
604 if (line.length() == 0) {
605 continue;
606 }
607
608 Matcher matcher = FRAMEBASE_PATTERN.matcher(line);
609 if (!matcher.matches()) {
610 continue;
611 }
612
613 String frame = matcher.group(1).toLowerCase();
614 String lemma = matcher.group(2);
615 lemma = getLemmaFromPredicateName(lemma);
616 String wnSynset = WordNet.getSynsetID(Long.parseLong(matcher.group(4)), matcher.group(3));
617
618 String key = getFrameBaseKey(frame, lemma, ignoreLemmaInFrameBaseMappings);
619
620 fbFramenetToWordNet.put(key, wnSynset);
621 }
622
623
624
625
626
627
628
629
630 LOGGER.info("Reading PropBank files");
631 List<RolesetInfo> rolesets = new ArrayList<>();
632 Map<String, String> predicateToLemma = new HashMap<>();
633
634
635 for (File file : Files.fileTreeTraverser().preOrderTraversal(pbFolder)) {
636
637 if (!file.isFile()) {
638 continue;
639 }
640
641 if (!file.getName().endsWith(".xml")) {
642 continue;
643 }
644
645
646 String type;
647 String baseLemma;
648 Matcher matcher = ONTONOTES_FILENAME_PATTERN.matcher(file.getName());
649 if (matcher.matches()) {
650 type = matcher.group(2);
651 baseLemma = matcher.group(1);
652 } else {
653 throw new Exception(
654 "File " + file.getName() + " does not appear to be a good OntoNotes frame file");
655 }
656
657 if (!type.equals("v")) {
658 continue;
659 }
660
661 LOGGER.debug(file.getName());
662
663 HashMap<String, HashMap<String, Set>> senses = getSenses(file.getName(), ontonotesFolder, baseLemma,
664 type, onUnmarshaller);
665
666 Frameset frameset = (Frameset) fnUnmarshaller.unmarshal(file);
667 List<Object> noteOrPredicate = frameset.getNoteOrPredicate();
668
669 for (Object predicate : noteOrPredicate) {
670 if (predicate instanceof Predicate) {
671
672 String lemma = getLemmaFromPredicateName(((Predicate) predicate).getLemma());
673
674 List<String> synsets = WordNet.getSynsetsForLemma(lemma.replace('+', ' '), type);
675
676 Set<String> luFrames = lus.get(type).get(lemma);
677 luFrames.retainAll(existingFrames);
678
679 List<Object> noteOrRoleset = ((Predicate) predicate).getNoteOrRoleset();
680 for (Object roleset : noteOrRoleset) {
681 if (roleset instanceof Roleset) {
682 String rolesetID = ((Roleset) roleset).getId();
683 predicateToLemma.put(rolesetID, lemma.replace('+', '_'));
684
685 RolesetInfo rolesetInfo = new RolesetInfo(file, rolesetID, baseLemma, lemma, type,
686 senses, luFrames, (Roleset) roleset, synsets);
687 rolesets.add(rolesetInfo);
688 }
689 }
690 }
691 }
692 }
693
694
695
696
697 int trivialCount = 0;
698 int nonTrivialCount = 0;
699 int nbCount = 0;
700 int emptyRelatedCount = 0;
701 int nbGreaterCount = 0;
702 int nbZeroCount = 0;
703 int unlinkedCount = 0;
704 int roleMappingCount = 0;
705 int nbRoleMappingCount = 0;
706 int noFrameBaseCount = 0;
707 int semlinkCounter = 0;
708
709 HashMap<OutputMapping, HashMap<String, String>> outputMappingsForPredicates = new HashMap<>();
710 HashMap<OutputMapping, HashMap<String, String>> outputMappingsForPredicatesAdd = new HashMap<>();
711 HashMap<OutputMapping, HashMap<String, String>> outputMappingsForRoles = new HashMap<>();
712 for (OutputMapping outputMapping : OutputMapping.values()) {
713 outputMappingsForPredicates.put(outputMapping, new HashMap<>());
714 outputMappingsForPredicatesAdd.put(outputMapping, new HashMap<>());
715 outputMappingsForRoles.put(outputMapping, new HashMap<>());
716 }
717
718 File frameFile = new File(outputPattern + "-frames.ser");
719 File rolesFile = new File(outputPattern + "-roles.ser");
720 File addFile = new File(outputPattern + "-add.ser");
721
722 if (frameFile.exists() && rolesFile.exists() && addFile.exists()) {
723 LOGGER.info("Loading mappings from files");
724 outputMappingsForPredicates = (HashMap<OutputMapping, HashMap<String, String>>) loadObjectFromFile(
725 frameFile);
726 outputMappingsForRoles = (HashMap<OutputMapping, HashMap<String, String>>) loadObjectFromFile(
727 rolesFile);
728 outputMappingsForPredicatesAdd = (HashMap<OutputMapping, HashMap<String, String>>) loadObjectFromFile(
729 addFile);
730 } else {
731 for (RolesetInfo rolesetInfo : rolesets) {
732
733 Roleset roleset = rolesetInfo.getRoleset();
734 String rolesetID = rolesetInfo.getLabel();
735 HashMap<String, HashMap<String, Set>> senses = rolesetInfo.getSenses();
736 List<String> synsets = rolesetInfo.getSynsets();
737 String lemma = rolesetInfo.getLemma();
738 String baseLemma = rolesetInfo.getBaseLemma();
739 Set<String> luFrames = rolesetInfo.getLuFrames();
740 String type = rolesetInfo.getType();
741
742 String frameNet = roleset.getFramnet();
743
744 if (frameNet != null) {
745 frameNet = frameNet.toLowerCase();
746 }
747
748 LOGGER.debug(rolesetID);
749
750 ArrayList<String> fnFrames = new ArrayList<>();
751 if (frameNet != null) {
752 String[] fns = frameNet.split("\\s+");
753 for (String fn : fns) {
754 if (fn.length() == 0) {
755 continue;
756 }
757 fnFrames.add(fn);
758 }
759 }
760 fnFrames.retainAll(existingFrames);
761
762 if (enableSemLinkForPredicates && fnFrames.size() == 0) {
763 String vnClasses = roleset.getVncls();
764 if (vnClasses != null) {
765 vnClasses = vnClasses.trim();
766 String[] parts = vnClasses.split("\\s+");
767 for (String part : parts) {
768 Set<String> frames = verbnetToFramenet.get(part);
769 if (frames != null) {
770 fnFrames = new ArrayList<>(frames);
771 }
772 }
773 }
774 }
775
776 Collection<String> wnFromSenses = new HashSet<>();
777 Collection<String> fnFromSenses = new HashSet<>();
778 if (senses.get(rolesetID) != null) {
779 wnFromSenses = senses.get(rolesetID).get("wn");
780 fnFromSenses = senses.get(rolesetID).get("fn");
781 }
782 fnFromSenses.retainAll(existingFrames);
783
784
785
786
787 Collection<String> wnCandidates = getIntersection(synsets, wnFromSenses);
788
789 boolean useBaseLemma = false;
790 String lemmaToUse = lemma;
791
792 if (!lemma.equals(baseLemma)) {
793 if (synsets.size() + wnFromSenses.size() == 0) {
794 useBaseLemma = true;
795 }
796 for (String wnCandidate : wnCandidates) {
797 Set<String> lemmas = WordNet.getLemmas(wnCandidate);
798 if (lemmas.contains(baseLemma)) {
799 useBaseLemma = true;
800 }
801 }
802
803 if (useBaseLemma && luFrames.size() != 0) {
804 LOGGER.debug("Base lemma should be used, but lexical unit found ({})",
805 rolesetID);
806 useBaseLemma = false;
807 }
808 }
809
810 Set<String> luFramesToUse = new HashSet<>(luFrames);
811
812 if (useBaseLemma) {
813 LOGGER.debug("Using base lemma");
814 lemmaToUse = baseLemma;
815 luFramesToUse = lus.get(type).get(baseLemma);
816
817 List<String> newSynsets = WordNet
818 .getSynsetsForLemma(baseLemma.replace('+', ' '), type);
819 wnCandidates = getIntersection(wnCandidates, newSynsets);
820 }
821
822 Collection<String> fnCandidates = getIntersection(fnFrames, luFramesToUse,
823 fnFromSenses);
824
825 Collection<String> fnCandidatesOnlySemLink = getIntersection(fnFrames,
826 fnFromSenses);
827 if (fnCandidatesOnlySemLink.size() == 1) {
828 semlinkCounter++;
829 }
830
831 Collection<String> okFrames = getCandidateFrames(wnCandidates, fnCandidates,
832 lemmaToUse,
833 type, fbFramenetToWordNet, ignoreLemmaInFrameBaseMappings);
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849 if (fnCandidatesOnlySemLink.size() == 1 && okFrames.size() == 0) {
850 for (String fnCandidate : fnCandidates) {
851 outputMappingsForPredicatesAdd.get(OutputMapping.PBauto)
852 .put(rolesetID, fnCandidate);
853 noFrameBaseCount++;
854 }
855 }
856
857
858
859 if (okFrames.size() == 1) {
860 for (String okFrame : okFrames) {
861 if (fnFrames.size() == 1 && fnFrames.contains(okFrame)) {
862 trivialCount++;
863 outputMappingsForPredicates.get(OutputMapping.PBtrivial)
864 .put(rolesetID, okFrame);
865 continue;
866 }
867 nonTrivialCount++;
868
869 outputMappingsForPredicates.get(OutputMapping.PBauto)
870 .put(rolesetID, okFrame);
871 }
872 }
873
874
875 Set<Roleset> fRolesets = nbFrames.get(rolesetID);
876 for (Roleset nbRoleset : fRolesets) {
877
878
879 String nbLemma = nbRoleset.getName();
880
881 List<String> nbSynsets = WordNet
882 .getSynsetsForLemma(nbLemma.replace('+', ' '), "n");
883
884 Set<String> relatedSynsets = new HashSet<>();
885 for (String wnCandidate : wnCandidates) {
886 relatedSynsets
887 .addAll(WordNet.getGenericSet(wnCandidate, PointerType.DERIVED,
888 PointerType.NOMINALIZATION, PointerType.PARTICIPLE_OF,
889 PointerType.PERTAINYM));
890 }
891
892 if (relatedSynsets.size() == 0) {
893 emptyRelatedCount++;
894 }
895
896 Set<String> luNbFrames = lus.get("n").get(nbLemma);
897 Collection<String> fnNbCandidates = getIntersection(fnFrames, luFrames,
898 fnFromSenses, luNbFrames);
899
900 Collection<String> nbCandidates = getIntersection(nbSynsets, relatedSynsets);
901 Collection<String> okNbFrames = getCandidateFrames(nbCandidates, fnNbCandidates,
902 nbLemma, "n", fbFramenetToWordNet, ignoreLemmaInFrameBaseMappings);
903
904
905
906 if (okNbFrames.size() == 1) {
907 for (String okFrame : okNbFrames) {
908 nbCount++;
909 outputMappingsForPredicates.get(OutputMapping.NBauto)
910 .put(nbRoleset.getId(), okFrame);
911 }
912 }
913 if (okNbFrames.size() > 1) {
914 nbGreaterCount++;
915 }
916 if (okNbFrames.size() == 0) {
917 nbZeroCount++;
918 }
919 }
920 }
921
922
923 for (Roleset nbRoleset : nbUnlinked) {
924
925
926 String nbLemma = nbRoleset.getName();
927 List<String> nbSynsets = WordNet.getSynsetsForLemma(nbLemma.replace('+', ' '), "n");
928
929 if (nbSynsets.size() == 1) {
930 Set<String> frames = lus.get("n").get(nbLemma);
931 if (frames != null && frames.size() == 1) {
932 for (String frame : frames) {
933 outputMappingsForPredicates.get(OutputMapping.NBresource).put(nbRoleset.getId(), frame);
934 }
935
936 unlinkedCount++;
937 }
938 } else {
939
940 }
941 }
942
943
944 for (RolesetInfo rolesetInfo : rolesets) {
945 Roleset roleset = rolesetInfo.getRoleset();
946 String rolesetID = rolesetInfo.getLabel();
947
948 for (Object roles : roleset.getNoteOrRolesOrExample()) {
949 if (!(roles instanceof Roles)) {
950 continue;
951 }
952
953 for (Object role : ((Roles) roles).getNoteOrRole()) {
954 if (!(role instanceof eu.fbk.dkm.pikes.resources.util.propbank.Role)) {
955 continue;
956 }
957
958 String n = ((eu.fbk.dkm.pikes.resources.util.propbank.Role) role).getN();
959 String roleStr = rolesetID + "@" + n;
960
961 HashSet<String> tempMappingsForRole = new HashSet<>();
962
963 for (Vnrole vnrole : ((eu.fbk.dkm.pikes.resources.util.propbank.Role) role)
964 .getVnrole()) {
965 String vnClassRole = vnrole.getVncls().toLowerCase();
966 String vnThetaRole =
967 vnClassRole + "@" + vnrole.getVntheta().toLowerCase();
968
969 Set<String> fnFrames = verbnetToFramenet
970 .get(vnThetaRole);
971 tempMappingsForRole.addAll(fnFrames);
972 }
973
974 if (tempMappingsForRole.size() == 1) {
975 for (String frameRole : tempMappingsForRole) {
976
977 String frameName = frameRole.replaceAll("@.*", "");
978 String goodCandidate;
979
980
981 goodCandidate = outputMappingsForPredicates.get(OutputMapping.PBauto)
982 .get(rolesetID);
983 if (goodCandidate == null || !goodCandidate.equals(frameName)) {
984 continue;
985 }
986
987 outputMappingsForRoles.get(OutputMapping.PBauto).put(roleStr, frameRole);
988 roleMappingCount++;
989
990
991 Set<Roleset> fRolesets = nbFrames.get(rolesetID);
992 for (Roleset nbRoleset : fRolesets) {
993
994 String nbRolesetID = nbRoleset.getId();
995
996 boolean isGoodCandidate = false;
997 goodCandidate = outputMappingsForPredicates.get(OutputMapping.NBauto)
998 .get(nbRolesetID);
999 if (goodCandidate == null || !goodCandidate.equals(frameName)) {
1000 isGoodCandidate = true;
1001 }
1002 goodCandidate = outputMappingsForPredicates.get(OutputMapping.NBresource)
1003 .get(nbRolesetID);
1004 if (goodCandidate == null || !goodCandidate.equals(frameName)) {
1005 isGoodCandidate = true;
1006 }
1007
1008 if (!isGoodCandidate) {
1009 continue;
1010 }
1011
1012 String correctN = n;
1013 HashMap<String, String> mappings = nomBankToProbBankRoles.get(nbRolesetID);
1014 if (mappings != null) {
1015 if (mappings.get(n) != null) {
1016 correctN = mappings.get(n);
1017 LOGGER.debug("Editing role...");
1018 }
1019 }
1020
1021 String nbRoleStr = nbRolesetID + "@" + correctN;
1022
1023 outputMappingsForRoles.get(OutputMapping.NBauto).put(nbRoleStr, frameRole);
1024 nbRoleMappingCount++;
1025 }
1026 }
1027 }
1028 }
1029 }
1030 }
1031
1032 LOGGER.info("*** STATISTICS ***");
1033
1034 LOGGER.info("PropBank trivial: {}", trivialCount);
1035 LOGGER.info("PropBank non-trivial: {}", nonTrivialCount);
1036 LOGGER.info("PropBank non-FrameBase: {}", noFrameBaseCount);
1037
1038 LOGGER.info("NomBank (linked): {}", nbCount);
1039 LOGGER.info("NomBank (unlinked): {}", unlinkedCount);
1040 LOGGER.info("NomBank (total): {}", unlinkedCount + nbCount);
1041
1042 LOGGER.info("PropBank (only with SemLink): {}", semlinkCounter);
1043
1044 LOGGER.info("PropBank roles (with SemLink): {}", roleMappingCount);
1045 LOGGER.info("NomBank roles (with SemLink): {}", nbRoleMappingCount);
1046
1047 LOGGER.info("No WordNet relations: {}", emptyRelatedCount);
1048 LOGGER.info("More than one frame: {}", nbGreaterCount);
1049 LOGGER.info("Zero frames: {}", nbZeroCount);
1050 }
1051
1052
1053
1054
1055 LOGGER.info("Parsing examples");
1056
1057 HashMap<String, FrequencyHashSet<String>> rolesCountByType = new HashMap<>();
1058 FrequencyHashSet<String> rolesCount = new FrequencyHashSet<>();
1059 int usedSentences = 0;
1060
1061 for (Sentence sentence : exampleSentences) {
1062 HashMap<Word, HashMap<String, Srl>> srlIndex = new HashMap<>();
1063
1064 for (Srl srl : sentence.getSrls()) {
1065 Word target = srl.getTarget().get(0);
1066
1067
1068 if (!target.getPos().toLowerCase().startsWith("v") && !target.getPos().toLowerCase()
1069 .startsWith("n")) {
1070 continue;
1071 }
1072
1073 if (!srlIndex.containsKey(target)) {
1074 srlIndex.put(target, new HashMap<>());
1075 }
1076 srlIndex.get(target).put(srl.getSource(), srl);
1077 }
1078
1079 for (Word word : srlIndex.keySet()) {
1080 if (srlIndex.get(word).size() > 1) {
1081
1082 usedSentences++;
1083
1084 Srl srlFrameNet = srlIndex.get(word).get("framenet");
1085 Srl srlMate = srlIndex.get(word).get("mate");
1086
1087 String framenet = srlFrameNet.getLabel();
1088 String mate = srlMate.getLabel();
1089
1090 boolean isVerb = true;
1091 if (word.getPos().toLowerCase().startsWith("n")) {
1092 isVerb = false;
1093 }
1094
1095 boolean mappingExists = false;
1096 String frameGuess;
1097
1098 if (isVerb) {
1099 frameGuess = outputMappingsForPredicates.get(OutputMapping.PBauto).get(mate);
1100 if (frameGuess != null && frameGuess.equals(framenet)) {
1101 mappingExists = true;
1102 }
1103 frameGuess = outputMappingsForPredicates.get(OutputMapping.PBtrivial).get(mate);
1104 if (frameGuess != null && frameGuess.equals(framenet)) {
1105 mappingExists = true;
1106 }
1107 } else {
1108 frameGuess = outputMappingsForPredicates.get(OutputMapping.NBauto).get(mate);
1109 if (frameGuess != null && frameGuess.equals(framenet)) {
1110 mappingExists = true;
1111 }
1112 frameGuess = outputMappingsForPredicates.get(OutputMapping.NBresource).get(mate);
1113 if (frameGuess != null && frameGuess.equals(framenet)) {
1114 mappingExists = true;
1115 }
1116 }
1117
1118 if (mappingExists) {
1119
1120 HashMap<Word, String> roleWordsMate = new HashMap<>();
1121 HashMap<Word, String> roleWordsFrameNet = new HashMap<>();
1122
1123
1124 for (eu.fbk.fcw.utils.corpus.Role role : srlMate.getRoles()) {
1125 Word roleHead = role.getSpan().get(0);
1126 String roleLabel = role.getLabel();
1127 roleLabel = roleLabel.replaceAll("R-", "");
1128
1129
1130 if (roleLabel.startsWith("AM-")) {
1131 continue;
1132 }
1133
1134 roleWordsMate.put(roleHead, roleLabel);
1135 }
1136
1137
1138 for (eu.fbk.fcw.utils.corpus.Role role : srlFrameNet.getRoles()) {
1139 Word roleHead = role.getSpan().get(0);
1140 String roleLabel = role.getLabel();
1141 roleWordsFrameNet.put(roleHead, roleLabel);
1142 }
1143
1144 for (Word key : roleWordsMate.keySet()) {
1145 String prefix = isVerb ? "v-" : "n-";
1146 String mateCompressed =
1147 prefix + mate + "@" + roleWordsMate.get(key).replaceAll("[aA]", "");
1148 rolesCount.add(mateCompressed);
1149
1150 if (!rolesCountByType.containsKey(mateCompressed)) {
1151 rolesCountByType.put(mateCompressed, new FrequencyHashSet<>());
1152 }
1153
1154 String fnRole = roleWordsFrameNet.get(key);
1155 if (fnRole != null) {
1156 fnRole = fnRole.toLowerCase();
1157 String fnCompressed = framenet + "@" + fnRole;
1158 rolesCountByType.get(mateCompressed).add(fnCompressed);
1159 } else {
1160 rolesCountByType.get(mateCompressed).add("[none]");
1161 }
1162 }
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176 } else {
1177
1178 }
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189 }
1190 }
1191 }
1192
1193 LOGGER.info("Used sentences: {}", usedSentences);
1194
1195
1196
1197
1198 double okThreshold = 0.5;
1199 int okMinFreq = 2;
1200 HashMap<OutputMapping, HashMap<String, String>> outputMappingsForRolesFromExamples = new HashMap<>();
1201 for (OutputMapping outputMapping : OutputMapping.values()) {
1202 outputMappingsForRolesFromExamples.put(outputMapping, new HashMap<>());
1203 }
1204
1205 for (double threshold = 0.5; threshold < 1; threshold += 0.1) {
1206 for (int minFreq = 2; minFreq <= 10; minFreq++) {
1207 int trivialMappingsCount = 0;
1208 int correctMappingsCount = 0;
1209 int wrongMappingsCount = 0;
1210
1211 HashMap<OutputMapping, HashMap<String, String>> outputMappingsForRolesFromExamplesTemp = new HashMap<>();
1212 for (OutputMapping outputMapping : OutputMapping.values()) {
1213 outputMappingsForRolesFromExamplesTemp.put(outputMapping, new HashMap<>());
1214 }
1215
1216 for (String key : rolesCount.keySet()) {
1217
1218 String candidate = rolesCountByType.get(key).mostFrequent();
1219 int freq = rolesCountByType.get(key).get(candidate);
1220 double ratio = 0.0;
1221 if (candidate != null && !candidate.equals("[none]")) {
1222 ratio = (double) freq / (double) rolesCount.get(key);
1223 } else {
1224 candidate = null;
1225 }
1226 if (ratio > threshold && freq >= minFreq) {
1227 String mate = key.substring(2);
1228 OutputMapping mapping = key.startsWith("v") ? OutputMapping.PBauto : OutputMapping.NBauto;
1229 outputMappingsForRolesFromExamplesTemp.get(mapping).put(mate, candidate);
1230
1231
1232 if (Math.abs(okThreshold - threshold) < 0.01 && minFreq == okMinFreq) {
1233 outputMappingsForRolesFromExamples.get(mapping).put(mate, candidate);
1234 }
1235
1236 String fnRole;
1237
1238 switch (mapping) {
1239 case PBauto:
1240 fnRole = outputMappingsForRoles.get(OutputMapping.PBauto).get(mate);
1241 if (fnRole != null) {
1242 trivialMappingsCount++;
1243 if (fnRole.equals(candidate)) {
1244 correctMappingsCount++;
1245 } else {
1246 wrongMappingsCount++;
1247 }
1248 }
1249 break;
1250 case NBauto:
1251 fnRole = outputMappingsForRoles.get(OutputMapping.NBauto).get(mate);
1252 if (fnRole != null) {
1253 trivialMappingsCount++;
1254 if (fnRole.equals(candidate)) {
1255 correctMappingsCount++;
1256 } else {
1257 wrongMappingsCount++;
1258 }
1259 }
1260 break;
1261 }
1262 }
1263 }
1264
1265
1266
1267
1268
1269 if (printPRTable) {
1270 int tp = correctMappingsCount;
1271 int fp = wrongMappingsCount;
1272 int fn = outputMappingsForRoles.get(OutputMapping.PBauto).size() - trivialMappingsCount + fp;
1273 double precision = (double) tp / (double) (tp + fp);
1274 double recall = (double) tp / (double) (tp + fn);
1275 double f1 = 2 * (precision * recall) / (precision + recall);
1276 System.out.println(String.format(
1277 "%5f %5d %5d %5d %5d %5d %5d %5f %5f %5f",
1278 threshold,
1279 minFreq,
1280 outputMappingsForRolesFromExamplesTemp.get(OutputMapping.PBauto).size(),
1281 outputMappingsForRolesFromExamplesTemp.get(OutputMapping.NBauto).size(),
1282 trivialMappingsCount,
1283 correctMappingsCount,
1284 wrongMappingsCount,
1285 precision,
1286 recall,
1287 f1
1288 )
1289 );
1290 }
1291 }
1292 }
1293
1294
1295
1296
1297 outputFile = new File(outputPattern + "-frames.tsv");
1298 LOGGER.info("Writing output file {}", outputFile.getName());
1299 writer = new BufferedWriter(new FileWriter(outputFile));
1300 for (OutputMapping outputMapping : outputMappingsForPredicates.keySet()) {
1301 for (String key : outputMappingsForPredicates.get(outputMapping).keySet()) {
1302 String value = outputMappingsForPredicates.get(outputMapping).get(key);
1303
1304 IRI fbIRI = null;
1305 FrameBase.POS pos = FrameBase.POS.VERB;
1306 switch (outputMapping) {
1307 case NBauto:
1308 case NBresource:
1309 pos = FrameBase.POS.NOUN;
1310 break;
1311 }
1312
1313 String lemma = predicateToLemma.get(key);
1314 if (lemma != null) {
1315 fbIRI = FrameBase.classFor(value, lemma, pos);
1316 }
1317 if (fbIRI == null) {
1318 lemma = key.substring(0, key.length() - 3);
1319 fbIRI = FrameBase.classFor(value, lemma, pos);
1320 }
1321
1322
1323 if (fbIRI == null) {
1324 LOGGER.error("This should never happen!");
1325 LOGGER.debug(value);
1326 LOGGER.debug(key);
1327 LOGGER.debug(key.substring(0, key.length() - 3));
1328 LOGGER.debug(lemma);
1329 break;
1330 }
1331
1332 writer.append(outputMapping.toString()).append('\t');
1333 writer.append(key).append('\t');
1334 writer.append(value).append('\n');
1335
1336 writerFrames.append(outputMapping.toString().substring(0, 2).toLowerCase()).append(':');
1337 writerFrames.append(key).append('\t');
1338 writerFrames.append(lemma).append('\t');
1339 writerFrames.append(pos.getLetter()).append('\t');
1340 writerFrames.append(fbIRI.toString()).append('\n');
1341 }
1342 }
1343 writer.close();
1344 if (saveFiles) {
1345 outputFile = new File(outputPattern + "-frames.ser");
1346 saveObjectToFile(outputMappingsForPredicates, outputFile);
1347 }
1348
1349 outputFile = new File(outputPattern + "-roles.tsv");
1350 LOGGER.info("Writing output file {}", outputFile.getName());
1351 writer = new BufferedWriter(new FileWriter(outputFile));
1352 for (OutputMapping outputMapping : outputMappingsForRoles.keySet()) {
1353 for (String key : outputMappingsForRoles.get(outputMapping).keySet()) {
1354 String value = outputMappingsForRoles.get(outputMapping).get(key);
1355
1356 String[] parts = value.split("@");
1357 if (parts.length < 2) {
1358 LOGGER.error("This is impossible!");
1359 break;
1360 }
1361 IRI fbIRI = FrameBase.propertyFor(parts[0], parts[1]);
1362 if (fbIRI == null) {
1363 LOGGER.error("This should never happen!");
1364 LOGGER.debug(key);
1365 LOGGER.debug(value);
1366 break;
1367 }
1368
1369 writer.append(outputMapping.toString()).append('\t');
1370 writer.append(key).append('\t');
1371 writer.append(value).append('\t');
1372
1373 writerRoles.append(outputMapping.toString().substring(0, 2).toLowerCase()).append(':');
1374 writerRoles.append(key).append('\t');
1375 writerRoles.append(fbIRI.toString()).append('\n');
1376 }
1377 }
1378 writer.close();
1379 if (saveFiles) {
1380 outputFile = new File(outputPattern + "-roles.ser");
1381 saveObjectToFile(outputMappingsForRoles, outputFile);
1382 }
1383
1384 if (outputMappingsForRolesFromExamples != null) {
1385 outputFile = new File(outputPattern + "-roles-examples.tsv");
1386 LOGGER.info("Writing output file {}", outputFile.getName());
1387 writer = new BufferedWriter(new FileWriter(outputFile));
1388 for (OutputMapping outputMapping : outputMappingsForRolesFromExamples.keySet()) {
1389 for (String key : outputMappingsForRolesFromExamples.get(outputMapping).keySet()) {
1390 String value = outputMappingsForRolesFromExamples.get(outputMapping).get(key);
1391
1392 String[] parts = value.split("@");
1393 if (parts.length < 2) {
1394 LOGGER.error("This is impossible!");
1395 break;
1396 }
1397 IRI fbIRI = FrameBase.propertyFor(parts[0], parts[1]);
1398 if (fbIRI == null) {
1399 LOGGER.error("This should never happen!");
1400 LOGGER.debug(key);
1401 LOGGER.debug(value);
1402 break;
1403 }
1404
1405 writer.append(outputMapping.toString()).append('\t');
1406 writer.append(key).append('\t');
1407 writer.append(value).append('\n');
1408
1409 writerRoles.append(outputMapping.toString().substring(0, 2).toLowerCase()).append(':');
1410 writerRoles.append(key).append('\t');
1411 writerRoles.append(fbIRI.toString()).append('\n');
1412 }
1413 }
1414 writer.close();
1415 }
1416
1417 outputFile = new File(outputPattern + "-add.tsv");
1418 LOGGER.info("Writing output file {}", outputFile.getName());
1419 writer = new BufferedWriter(new FileWriter(outputFile));
1420 for (OutputMapping outputMapping : outputMappingsForPredicatesAdd.keySet()) {
1421 for (String key : outputMappingsForPredicatesAdd.get(outputMapping).keySet()) {
1422 String value = outputMappingsForPredicatesAdd.get(outputMapping).get(key);
1423
1424 writer.append(outputMapping.toString()).append('\t');
1425 writer.append(key).append('\t');
1426 writer.append(value).append('\n');
1427 }
1428 }
1429 writer.close();
1430 if (saveFiles) {
1431 outputFile = new File(outputPattern + "-add.ser");
1432 saveObjectToFile(outputMappingsForPredicatesAdd, outputFile);
1433 }
1434
1435 if (saveFiles) {
1436 outputFile = new File(outputPattern + "-lu-existingFrames.ser");
1437 if (!outputFile.exists()) {
1438 LOGGER.info("Writing object file {}", outputFile.getName());
1439 saveObjectToFile(existingFrames, outputFile);
1440 }
1441
1442 outputFile = new File(outputPattern + "-lu-lus.ser");
1443 if (!outputFile.exists()) {
1444 LOGGER.info("Writing object file {}", outputFile.getName());
1445 saveObjectToFile(lus, outputFile);
1446 }
1447
1448 outputFile = new File(outputPattern + "-lu-exampleSentences.ser");
1449 if (!outputFile.exists()) {
1450 LOGGER.info("Writing object file {}", outputFile.getName());
1451 saveObjectToFile(exampleSentences, outputFile);
1452 }
1453 }
1454
1455 writerFrames.close();
1456 writerRoles.close();
1457
1458 } catch (Exception e) {
1459 CommandLine.fail(e);
1460 }
1461 }
1462
1463 private static Object loadObjectFromFile(File inputFile) throws IOException {
1464 ObjectInputStream objectinputstream = null;
1465 FileInputStream streamIn = null;
1466 try {
1467 streamIn = new FileInputStream(inputFile);
1468 objectinputstream = new ObjectInputStream(streamIn);
1469 return objectinputstream.readObject();
1470 } catch (Exception e) {
1471 e.printStackTrace();
1472 } finally {
1473 if (objectinputstream != null) {
1474 objectinputstream.close();
1475 }
1476 }
1477 return null;
1478 }
1479
1480 private static void saveObjectToFile(Object o, File outputFile) throws IOException {
1481 ObjectOutputStream oos = null;
1482 FileOutputStream fout = null;
1483 try {
1484 fout = new FileOutputStream(outputFile);
1485 oos = new ObjectOutputStream(fout);
1486 oos.writeObject(o);
1487 } catch (Exception e) {
1488 e.printStackTrace();
1489 } finally {
1490 if (oos != null) {
1491 oos.close();
1492 }
1493 }
1494 }
1495
1496 private static List<Integer> getSpan(String text, LabelType label) {
1497 List<Integer> ret = new ArrayList<>();
1498
1499 Integer start = label.getStart();
1500 if (start == null) {
1501 return null;
1502 }
1503
1504 Integer end = label.getEnd();
1505 String before = text.substring(0, start);
1506 before = before.replaceAll("\\s+", " ");
1507 int target = before.replaceAll("[^\\s]", "").length();
1508 String inside = text.substring(start, end);
1509 inside = inside.replaceAll("\\s+", " ");
1510 int length = inside.replaceAll("[^\\s]", "").length() + 1;
1511
1512 for (int i = 0; i < length; i++) {
1513 ret.add(target + i);
1514 }
1515
1516 return ret;
1517 }
1518
1519 private static HashMap<String, HashMap<String, Set>> getSenses(String name, File ontonotesFolder, String fnLemma,
1520 String type, Unmarshaller onUnmarshaller)
1521 throws JAXBException {
1522
1523 HashMap<String, HashMap<String, Set>> senses = new HashMap<>();
1524
1525
1526 File onSense = new File(ontonotesFolder.getAbsolutePath() + File.separator + name);
1527 if (onSense.exists()) {
1528
1529 Inventory inventory = (Inventory) onUnmarshaller.unmarshal(onSense);
1530 for (Sense sense : inventory.getSense()) {
1531
1532 if (sense.getMappings() == null) {
1533 continue;
1534 }
1535
1536 Set<String> onWn = new HashSet<>();
1537 Set<String> onFn = new HashSet<>();
1538 Set<String> onPb = new HashSet<>();
1539
1540
1541 if (sense.getMappings().getPb() != null) {
1542 String[] pbs = sense.getMappings().getPb().split(",");
1543 for (String pb : pbs) {
1544 pb = pb.trim();
1545 if (pb.length() == 0) {
1546 continue;
1547 }
1548 onPb.add(pb);
1549 }
1550 }
1551
1552
1553 if (sense.getMappings().getFn() != null) {
1554 String[] fns = sense.getMappings().getFn().split(",");
1555 for (String fn : fns) {
1556 fn = fn.trim().toLowerCase();
1557 if (fn.length() == 0) {
1558 continue;
1559 }
1560 onFn.add(fn);
1561 }
1562 }
1563
1564
1565 try {
1566 for (Wn wn : sense.getMappings().getWn()) {
1567 String lemma = wn.getLemma();
1568 if (lemma == null || lemma.length() == 0) {
1569 lemma = fnLemma;
1570 }
1571 String value = wn.getvalue();
1572 String[] ids = value.split(",");
1573 for (String id : ids) {
1574 id = id.trim();
1575 if (id.length() == 0) {
1576 continue;
1577 }
1578 String synsetID = WordNet.getSynsetID(lemma + "-" + id + type);
1579 onWn.add(synsetID);
1580 }
1581 }
1582 } catch (Exception e) {
1583
1584 }
1585
1586 for (String pb : onPb) {
1587 if (!senses.containsKey(pb)) {
1588 senses.put(pb, new HashMap<>());
1589 }
1590 if (!senses.get(pb).containsKey("wn")) {
1591 senses.get(pb).put("wn", new HashSet<>());
1592 }
1593 if (!senses.get(pb).containsKey("fn")) {
1594 senses.get(pb).put("fn", new HashSet<>());
1595 }
1596 senses.get(pb).get("wn").addAll(onWn);
1597 senses.get(pb).get("fn").addAll(onFn);
1598 }
1599 }
1600 }
1601
1602 return senses;
1603 }
1604
1605 private static Collection<String> getCandidateFrames(Collection<String> wnCandidates,
1606 Collection<String> fnCandidates,
1607 String lemma, String type, HashMultimap<String, String> fbFramenetToWordNet,
1608 boolean ignoreLemmaInFrameBaseMappings) {
1609
1610 Collection<String> okFrames = new HashSet<>();
1611 for (String fnCandidate : fnCandidates) {
1612 String key = getFrameBaseKey(fnCandidate, lemma, type, ignoreLemmaInFrameBaseMappings);
1613 Collection<String> wnCandidatesForThisFrame = new HashSet<>(fbFramenetToWordNet.get(key));
1614 wnCandidatesForThisFrame.retainAll(wnCandidates);
1615 if (wnCandidatesForThisFrame.size() > 0) {
1616 okFrames.add(fnCandidate);
1617 }
1618 }
1619
1620 return okFrames;
1621 }
1622
1623 private static String getFrameBaseKey(String frame, String lemma, String type,
1624 boolean ignoreLemmaInFrameBaseMappings) {
1625 return getFrameBaseKey(frame, lemma + "." + type, ignoreLemmaInFrameBaseMappings);
1626 }
1627
1628 private static String getFrameBaseKey(String frame, String lemma, boolean ignoreLemmaInFrameBaseMappings) {
1629 if (ignoreLemmaInFrameBaseMappings) {
1630 return frame;
1631 }
1632 return frame + "-" + lemma;
1633 }
1634 }