1   package eu.fbk.dkm.pikes.tintop.util.framenet;
2   
3   import ch.qos.logback.classic.Level;
4   import com.google.common.base.Charsets;
5   import com.google.common.collect.HashMultimap;
6   import com.google.common.io.Files;
7   import eu.fbk.dkm.pikes.resources.FrameBase;
8   import eu.fbk.dkm.pikes.resources.WordNet;
9   import eu.fbk.fcw.utils.corpus.Corpus;
10  import eu.fbk.fcw.utils.corpus.Sentence;
11  import eu.fbk.fcw.utils.corpus.Srl;
12  import eu.fbk.fcw.utils.corpus.Word;
13  import eu.fbk.dkm.pikes.resources.util.fnlu.*;
14  import eu.fbk.dkm.pikes.resources.util.onsenses.Inventory;
15  import eu.fbk.dkm.pikes.resources.util.onsenses.Sense;
16  import eu.fbk.dkm.pikes.resources.util.onsenses.Wn;
17  import eu.fbk.dkm.pikes.resources.util.propbank.*;
18  import eu.fbk.dkm.pikes.resources.util.semlink.vnfn.SemLinkRoot;
19  import eu.fbk.dkm.pikes.resources.util.semlink.vnfnroles.Role;
20  import eu.fbk.dkm.pikes.resources.util.semlink.vnfnroles.SemLinkRolesRoot;
21  import eu.fbk.dkm.pikes.resources.util.semlink.vnfnroles.Vncls;
22  import eu.fbk.dkm.pikes.resources.util.semlink.vnpb.Argmap;
23  import eu.fbk.dkm.pikes.resources.util.semlink.vnpb.PbvnTypemap;
24  import eu.fbk.utils.core.CommandLine;
25  import eu.fbk.utils.core.FrequencyHashSet;
26  import net.didion.jwnl.data.PointerType;
27  import org.eclipse.rdf4j.model.IRI;
28  import org.slf4j.Logger;
29  import org.slf4j.LoggerFactory;
30  
31  import javax.xml.bind.JAXBContext;
32  import javax.xml.bind.JAXBException;
33  import javax.xml.bind.Unmarshaller;
34  import java.io.*;
35  import java.util.*;
36  import java.util.regex.Matcher;
37  import java.util.regex.Pattern;
38  
39  /**
40   * Created by alessio on 12/11/15.
41   */
42  
// *todo: check parentheses
// *todo: add SemLink
// *todo: separate the extractions
// *todo: check '+'
// *todo: move nofb to a separate file
48  
49  public class MergeMateFramenet {
50  
51      // Google docs: https://docs.google.com/document/d/1Uexv8352v0eI1Ij1I5j3U9cOHFNKPHlbqCSFTJcFTz4/edit#
52  
53      private static final Logger LOGGER = LoggerFactory.getLogger(MergeMateFramenet.class);
54      private static HashMap<String, String> lemmaToTransform = new HashMap();
55      static final Pattern ONTONOTES_FILENAME_PATTERN = Pattern.compile("(.*)-([a-z]+)\\.xml");
56      static final Pattern FRAMEBASE_PATTERN = Pattern
57              .compile("^[^\\s]+\\s+[^\\s]+\\s+([^\\s]+)\\s+-\\s+(.+)\\s+-\\s+([a-z])#([0-9]+)$");
58      static final Pattern PB_PATTERN = Pattern.compile("^verb-((.*)\\.[0-9]+)$");
59  
60      // Must contain frames including '-' (see README)
61      //todo: use a better method to add FrameNet links to FrameBase
62      static final Pattern FB_PREDICATES = Pattern.compile("^frame-(Chemical-sense_description|Non-commutative_process|Non-commutative_statement|[^-]*)-(.*)\\.([a-z]+)$");
63  
64      static final Pattern FB_ROLES = Pattern.compile("^fe-(.*)-(.*)$");
65  
66      public enum OutputMapping {
67          PBauto, NBauto, NBresource, PBtrivial
68      }
69  
70      static {
71          lemmaToTransform.put("cry+down(e)", "cry+down");
72      }
73  
74      /**
75       * Format the lemma for compatibility between datasets.
76       * In particular, spaces and underscores are replaced by '+'
77       *
78       * @param lemmaFromPredicate the input lemma
79       * @return the converted lemma
80       */
81      protected static String getLemmaFromPredicateName(String lemmaFromPredicate) {
82          String lemma = lemmaFromPredicate.replace('_', '+')
83                  .replace(' ', '+');
84          if (lemmaToTransform.keySet().contains(lemma)) {
85              lemma = lemmaToTransform.get(lemma);
86          }
87          return lemma;
88      }
89  
90      /**
91       * Intersect collections of strings, ignoring empty sets
92       *
93       * @param collections input collection(s)
94       * @return the resulting collection (intersection)
95       */
96      private static Collection<String> getIntersection(Collection<String>... collections) {
97          return getIntersection(true, collections);
98      }
99  
100     /**
101      * Intersect collections of strings
102      *
103      * @param ignoreEmptySets select whether ignoring empty sets for the intersection
104      * @param collections     input collection(s)
105      * @return the resulting collection (intersection)
106      */
107     private static Collection<String> getIntersection(boolean ignoreEmptySets, Collection<String>... collections) {
108         Collection<String> ret = null;
109         for (Collection<String> collection : collections) {
110             if (ignoreEmptySets && (collection == null || collection.size() == 0)) {
111                 continue;
112             }
113             if (ret == null) {
114                 ret = new HashSet<>();
115                 ret.addAll(collection);
116             } else {
117                 ret.retainAll(collection);
118             }
119         }
120 
121         if (ret == null) {
122             ret = new HashSet<>();
123         }
124         return ret;
125     }
126 
127     private static ArrayList<Matcher> getPropBankPredicates(Roleset roleset) {
128 
129         ArrayList<Matcher> ret = new ArrayList<>();
130 
131         String source = roleset.getSource();
132         if (source != null && source.length() > 0) {
133 
134             String[] parts = source.split("\\s+");
135             for (String part : parts) {
136                 if (part.trim().length() == 0) {
137                     continue;
138                 }
139 
140                 Matcher matcher = PB_PATTERN.matcher(source);
141                 if (!matcher.find()) {
142                     continue;
143                 }
144 
145                 ret.add(matcher);
146             }
147         }
148 
149         return ret;
150     }
151 
152     public static void main(String[] args) {
153         try {
154             final CommandLine cmd = CommandLine
155                     .parser()
156                     .withName("./merger")
157                     .withHeader("Transform linguistic resources into RDF")
158                     .withOption("p", "propbank", "PropBank folder", "FOLDER",
159                             CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
160                     .withOption("w", "wordnet", "WordNet folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
161                             false, true)
162                     .withOption("o", "ontonotes", "Ontonotes senses folder", "FOLDER",
163                             CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
164                     .withOption("l", "lu", "FrameNet LU folder", "FOLDER",
165                             CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
166                     .withOption(null, "lu-parsed", "FrameNet LU folder (parsed, in CoNLL format)", "FOLDER",
167                             CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
168                     .withOption("f", "framebase", "FrameBase FrameNet-WordNet map", "FILE",
169                             CommandLine.Type.FILE_EXISTING, true, false, true)
170                     .withOption("s", "semlink", "SemLink folder", "FOLDER",
171                             CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
172                     .withOption("n", "nombank", "NomBank folder", "FOLDER",
173                             CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
174                     .withOption("c", "close-match", "closeMatch file from PreMOn", "FILE",
175                             CommandLine.Type.FILE_EXISTING, true, false, true)
176                     .withOption(null, "ignore-lemma", "ignore lemma information")
177                     .withOption(null, "save-files", "serialize big files")
178                     .withOption(null, "print-pr-table", "print precision/recall table")
179                     .withOption("O", "output", "Output file prefix", "PREFIX",
180                             CommandLine.Type.STRING, true, false, true)
181                     .withOption(null, "enable-sl4p",
182                             "Enable extraction of frames using SemLink when framnet argument of roleset is empty")
183                     .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
184 
185             ((ch.qos.logback.classic.Logger) LOGGER).setLevel(Level.INFO);
186 
187             File pbFolder = cmd.getOptionValue("propbank", File.class);
188             File nbFolder = cmd.getOptionValue("nombank", File.class);
189             File wordnetFolder = cmd.getOptionValue("wordnet", File.class);
190             File ontonotesFolder = cmd.getOptionValue("ontonotes", File.class);
191             File framebaseFile = cmd.getOptionValue("framebase", File.class);
192             File closeMatchFile = cmd.getOptionValue("close-match", File.class);
193             File luFolder = cmd.getOptionValue("lu", File.class);
194             File luParsedFolder = cmd.getOptionValue("lu-parsed", File.class);
195             File semlinkFolder = cmd.getOptionValue("semlink", File.class);
196 
197             String outputPattern = cmd.getOptionValue("output", String.class);
198 
199             boolean enableSemLinkForPredicates = cmd.hasOption("enable-sl4p");
200             boolean saveFiles = cmd.hasOption("save-files");
201             boolean printPRTable = cmd.hasOption("print-pr-table");
202 
203             boolean ignoreLemmaInFrameBaseMappings = cmd.hasOption("ignore-lemma");
204 
205             // Start
206 
207             Integer max = null;
208 
209             WordNet.setPath(wordnetFolder.getAbsolutePath());
210             WordNet.init();
211 
212             JAXBContext fnContext = JAXBContext.newInstance(Frameset.class);
213             Unmarshaller fnUnmarshaller = fnContext.createUnmarshaller();
214 
215             JAXBContext onContext = JAXBContext.newInstance(Inventory.class);
216             Unmarshaller onUnmarshaller = onContext.createUnmarshaller();
217 
218             JAXBContext luContext = JAXBContext.newInstance(LexUnit.class);
219             Unmarshaller luUnmarshaller = luContext.createUnmarshaller();
220 
221             JAXBContext semlinkContext = JAXBContext.newInstance(SemLinkRoot.class);
222             Unmarshaller semlinkUnmarshaller = semlinkContext.createUnmarshaller();
223 
224             JAXBContext semlinkRolesContext = JAXBContext.newInstance(SemLinkRolesRoot.class);
225             Unmarshaller semlinkRolesUnmarshaller = semlinkRolesContext.createUnmarshaller();
226 
227             JAXBContext semlinkPbContext = JAXBContext.newInstance(PbvnTypemap.class);
228             Unmarshaller semlinkPbUnmarshaller = semlinkPbContext.createUnmarshaller();
229 
230             BufferedWriter writer, writerFrames, writerRoles;
231             File outputFile, outputFileFrames, outputFileRoles;
232             outputFileFrames = new File(outputPattern + "-frames-ok.tsv");
233             outputFileRoles = new File(outputPattern + "-roles-ok.tsv");
234             writerFrames = new BufferedWriter(new FileWriter(outputFileFrames));
235             writerRoles = new BufferedWriter(new FileWriter(outputFileRoles));
236 
237 
238             // Trivial frames/roles from FrameBase
239 
240             for (String predicate : FrameBase.getPredicatesSet()) {
241                 Matcher matcher = FB_PREDICATES.matcher(predicate.trim());
242                 if (!matcher.find()) {
243                     LOGGER.error("{} is not correctly formatted", predicate);
244                     continue;
245                 }
246 
247                 IRI fbIRI = FrameBase.uriFor(predicate);
248                 if (fbIRI == null) {
249                     LOGGER.error("This should never happen!");
250                     LOGGER.debug(predicate);
251                     break;
252                 }
253 
254                 String lemma = matcher.group(2).toLowerCase();
255                 lemma = lemma.replaceAll("\\(\\(.*\\)\\)", "");
256                 lemma = lemma.replaceAll("\\(", "");
257                 lemma = lemma.replaceAll("\\)", "");
258                 lemma = lemma.replace('_', ' ');
259                 lemma = lemma.trim();
260                 lemma = lemma.replace(' ', '_');
261 
262                 writerFrames.append("fn:");
263                 writerFrames.append(matcher.group(1).toLowerCase()).append('\t');
264                 writerFrames.append(lemma).append('\t');
265                 writerFrames.append(matcher.group(3).toLowerCase()).append('\t');
266                 writerFrames.append(fbIRI.toString()).append('\n');
267             }
268             for (String role : FrameBase.getRolesSet()) {
269                 Matcher matcher = FB_ROLES.matcher(role.trim());
270                 if (!matcher.find()) {
271                     LOGGER.error("{} is not correctly formatted", role);
272                     continue;
273                 }
274 
275                 IRI fbIRI = FrameBase.uriFor(role);
276                 if (fbIRI == null) {
277                     LOGGER.error("This should never happen!");
278                     LOGGER.debug(role);
279                     break;
280                 }
281 
282                 String roleAt = matcher.group(1) + "@" + matcher.group(2);
283                 roleAt = roleAt.toLowerCase();
284 
285                 writerRoles.append("fn:");
286                 writerRoles.append(roleAt).append('\t');
287                 writerRoles.append(fbIRI.toString()).append('\n');
288             }
289 
290 //            writerFrames.close();
291 //            writerRoles.close();
292 //            System.exit(1);
293 
294 
295             // closeMatch
296 
297             LOGGER.info("Loading closeMatches");
298             HashMap<String, HashMap<String, String>> nomBankToProbBankRoles = new HashMap<>();
299             Pattern CLOSEMATCH_PATTERN = Pattern.compile("nb10-(.*?)-arg([0-9])>.*pbon5-(.*?)-arg([0-9])>");
300             List<String> closeMatchLines = Files.readLines(closeMatchFile, Charsets.UTF_8);
301             for (String line : closeMatchLines) {
302                 line = line.trim();
303                 Matcher matcher = CLOSEMATCH_PATTERN.matcher(line);
304                 if (matcher.find()) {
305                     if (matcher.group(2).equals(matcher.group(4))) {
306                         continue;
307                     }
308 
309                     String nbPredicate = matcher.group(1);
310 
311                     if (!nomBankToProbBankRoles.containsKey(nbPredicate)) {
312                         nomBankToProbBankRoles.put(nbPredicate, new HashMap<>());
313                     }
314 
315                     nomBankToProbBankRoles.get(nbPredicate).put(matcher.group(2), matcher.group(4));
316                 }
317             }
318 
319 
320             // SemLink
321 
322             LOGGER.info("Loading SemLink");
323             File semlinkFile;
324 
325             semlinkFile = new File(semlinkFolder.getAbsolutePath() + File.separator + "vn-pb" + File.separator
326                     + "vnpbMappings");
327             PbvnTypemap semLinkPb = (PbvnTypemap) semlinkPbUnmarshaller.unmarshal(semlinkFile);
328 
329             HashMultimap<String, String> verbnetToPropbank = HashMultimap.create();
330             HashMultimap<String, String> propbankToVerbnet = HashMultimap.create();
331 
332             for (eu.fbk.dkm.pikes.resources.util.semlink.vnpb.Predicate predicate : semLinkPb.getPredicate()) {
333                 String lemma = predicate.getLemma();
334                 Argmap argmap = predicate.getArgmap();
335                 if (argmap == null) {
336                     continue;
337                 }
338 
339                 String pbFrame = argmap.getPbRoleset().toLowerCase();
340                 String vnClass = argmap.getVnClass().toLowerCase();
341 
342                 verbnetToPropbank.put(vnClass, pbFrame);
343                 propbankToVerbnet.put(pbFrame, vnClass);
344 
345                 for (eu.fbk.dkm.pikes.resources.util.semlink.vnpb.Role role : argmap.getRole()) {
346                     String pbArg = pbFrame + "@" + role.getPbArg().toLowerCase();
347                     String vnTheta = vnClass + "@" + role.getVnTheta().toLowerCase();
348 
349                     verbnetToPropbank.put(vnTheta, pbArg);
350                     propbankToVerbnet.put(pbArg, vnTheta);
351                 }
352 
353             }
354 
355             semlinkFile = new File(semlinkFolder.getAbsolutePath() + File.separator + "vn-fn" + File.separator
356                     + "VN-FNRoleMapping.txt");
357             SemLinkRolesRoot semLinkRoles = (SemLinkRolesRoot) semlinkRolesUnmarshaller.unmarshal(semlinkFile);
358 
359             HashMultimap<String, String> verbnetToFramenet = HashMultimap.create();
360             HashMultimap<String, String> framenetToVerbnet = HashMultimap.create();
361 
362             for (Vncls vncls : semLinkRoles.getVncls()) {
363                 String frame = vncls.getFnframe().toLowerCase();
364                 String vnClass = vncls.getClazz().toLowerCase();
365 
366                 verbnetToFramenet.put(vnClass, frame);
367                 framenetToVerbnet.put(frame, vnClass);
368 
369                 if (vncls.getRoles() == null) {
370                     continue;
371                 }
372 
373                 for (Role role : vncls.getRoles().getRole()) {
374                     String fnRole = frame + "@" + role.getFnrole().toLowerCase();
375                     String vnRole = vnClass + "@" + role.getVnrole().toLowerCase();
376 
377                     verbnetToFramenet.put(vnRole, fnRole);
378                     framenetToVerbnet.put(fnRole, vnRole);
379                 }
380             }
381 
382             semlinkFile = new File(
383                     semlinkFolder.getAbsolutePath() + File.separator + "vn-fn" + File.separator + "VNC-FNF.s");
384             SemLinkRoot semLink = (SemLinkRoot) semlinkUnmarshaller.unmarshal(semlinkFile);
385 
386             for (eu.fbk.dkm.pikes.resources.util.semlink.vnfn.Vncls vncls : semLink.getVncls()) {
387                 String vnClass = vncls.getClazz().toLowerCase();
388                 String frame = vncls.getFnframe().toLowerCase();
389 
390                 verbnetToFramenet.put(vnClass, frame);
391                 framenetToVerbnet.put(frame, vnClass);
392             }
393 
394 
395             // NomBank
396 
397             int nbSource = 0;
398 
399             LOGGER.info("Loading NomBank files");
400             HashMultimap<String, Roleset> nbFrames = HashMultimap.create();
401             HashSet<Roleset> nbUnlinked = new HashSet<>();
402             for (File file : Files.fileTreeTraverser().preOrderTraversal(nbFolder)) {
403 
404                 if (!file.isFile()) {
405                     continue;
406                 }
407 
408                 if (!file.getName().endsWith(".xml")) {
409                     continue;
410                 }
411 
412                 LOGGER.debug(file.getName());
413 
414                 Frameset frameset = (Frameset) fnUnmarshaller.unmarshal(file);
415                 List<Object> noteOrPredicate = frameset.getNoteOrPredicate();
416                 for (Object predicate : noteOrPredicate) {
417                     if (predicate instanceof Predicate) {
418                         String lemma = ((Predicate) predicate).getLemma();
419                         List<Object> noteOrRoleset = ((Predicate) predicate).getNoteOrRoleset();
420                         for (Object roleset : noteOrRoleset) {
421                             if (roleset instanceof Roleset) {
422 
423                                 // Warning: this is really BAD!
424                                 ((Roleset) roleset).setName(lemma);
425 
426                                 ArrayList<Matcher> predicates = getPropBankPredicates((Roleset) roleset);
427                                 for (Matcher matcher : predicates) {
428                                     String pb = matcher.group(1);
429                                     nbFrames.put(pb, (Roleset) roleset);
430                                     nbSource++;
431                                 }
432 
433                                 if (predicates.size() == 0) {
434                                     nbUnlinked.add((Roleset) roleset);
435                                 }
436                             }
437                         }
438                     }
439                 }
440             }
441 
442             LOGGER.info("Loaded {} rolesets with source", nbSource);
443             LOGGER.info("Loaded {} frames without source", nbUnlinked.size());
444 
445 
446             // FrameNet LUs
447 
448             LOGGER.info("Loading LU files");
449             int i = 0;
450             HashMap<String, HashMultimap<String, String>> lus = new HashMap<>();
451             HashSet<String> existingFrames = new HashSet<>();
452             List<Sentence> exampleSentences = new ArrayList<>();
453 
454             File existingFramesFile = new File(outputPattern + "-lu-existingFrames.ser");
455             File lusFile = new File(outputPattern + "-lu-lus.ser");
456             File exampleSentencesFile = new File(outputPattern + "-lu-exampleSentences.ser");
457             if (existingFramesFile.exists() && lusFile.exists() && exampleSentencesFile.exists()) {
458                 LOGGER.info("Loading data from files");
459                 existingFrames = (HashSet<String>) loadObjectFromFile(existingFramesFile);
460                 lus = (HashMap<String, HashMultimap<String, String>>) loadObjectFromFile(lusFile);
461                 exampleSentences = (List<Sentence>) loadObjectFromFile(exampleSentencesFile);
462             } else {
463                 for (File file : Files.fileTreeTraverser().preOrderTraversal(luFolder)) {
464                     if (!file.isFile()) {
465                         continue;
466                     }
467 
468                     if (!file.getName().endsWith(".xml")) {
469                         continue;
470                     }
471 
472                     LOGGER.debug(file.getName());
473                     i++;
474                     if (max != null && i > max) {
475                         break;
476                     }
477 
478                     LexUnit lexUnit = (LexUnit) luUnmarshaller.unmarshal(file);
479                     String lemma = "";
480                     POSType posType = lexUnit.getPOS();
481                     for (LexemeType lexeme : lexUnit.getLexeme()) {
482                         lemma = lemma + " " + lexeme.getName();
483                     }
484                     lemma = lemma.trim();
485 
486                     if (lemma.length() == 0 || posType == null) {
487                         LOGGER.error("Lemma or POS null ({}/{})", lemma, posType);
488                         continue;
489                     }
490                     String pos = posType.toString().toLowerCase();
491                     String frame = lexUnit.getFrame().toLowerCase();
492 
493                     // Get examples from parsed file
494                     Corpus corpus = null;
495                     File parsedFile = new File(luParsedFolder + File.separator + file.getName() + ".conll");
496                     if (parsedFile.exists()) {
497                         corpus = Corpus.readDocumentFromFile(parsedFile.getAbsolutePath(), "mate");
498                     }
499 
500                     // Merge examples
501                     int exampleNo = 0;
502                     if (corpus != null) {
503                         for (SubCorpusType subCorpus : lexUnit.getSubCorpus()) {
504                             for (SentenceType sentence : subCorpus.getSentence()) {
505                                 String text = sentence.getText();
506                                 if (text != null && text.length() > 0) {
507 
508                                     Sentence conllSentence = corpus.getSentences().get(exampleNo++);
509 
510                                     // This is an example
511                                     List<Integer> target = new ArrayList<>();
512                                     HashMultimap<String, List<Integer>> roles = HashMultimap.create();
513 
514                                     for (AnnotationSetType annotationSet : sentence.getAnnotationSet()) {
515                                         for (LayerType layer : annotationSet.getLayer()) {
516                                             String name = layer.getName();
517                                             if (name.equals("Target")) {
518                                                 for (LabelType label : layer.getLabel()) {
519                                                     target = getSpan(text, label);
520 
521                                                     // Target should be unique...
522                                                     break;
523                                                 }
524                                             }
525                                             if (name.equals("FE")) {
526                                                 for (LabelType label : layer.getLabel()) {
527                                                     List<Integer> span = getSpan(text, label);
528                                                     if (span == null) {
529                                                         continue;
530                                                     }
531                                                     roles.put(label.getName(), span);
532                                                 }
533 
534                                             }
535                                         }
536                                     }
537 
538                                     if (target == null || target.size() == 0) {
539                                         LOGGER.error("Target not found");
540                                         continue;
541                                     }
542 
543                                     try {
544                                         Integer targetHead = conllSentence.searchHead(target);
545                                         Srl srl = new Srl(conllSentence.getWords().get(targetHead), frame, "framenet");
546                                         for (String roleLabel : roles.keySet()) {
547                                             Set<List<Integer>> spans = roles.get(roleLabel);
548                                             for (List<Integer> span : spans) {
549                                                 Integer roleHead = conllSentence.searchHead(span);
550                                                 eu.fbk.fcw.utils.corpus.Role role = new eu.fbk.fcw.utils.corpus.Role(
551                                                         conllSentence.getWords().get(roleHead), roleLabel);
552                                                 srl.addRole(role);
553                                             }
554                                         }
555                                         conllSentence.addSrl(srl);
556                                     } catch (Exception e) {
557 
558                                         LOGGER.error("Error in aligning tokens");
559 
560 //                                    System.out.println(conllSentence);
561 //                                    System.out.println(file.getName());
562 //                                    System.out.println(lemma);
563 //                                    System.out.println(text);
564 //                                    System.out.println(frame);
565 //                                    System.out.println(target);
566 //                                    System.out.println(roles);
567 //                                    System.out.println();
568                                     }
569 
570                                     exampleSentences.add(conllSentence);
571                                 }
572                             }
573                         }
574                     }
575 
576                     existingFrames.add(frame);
577 //                Matcher matcher = LU_PATTERN.matcher(lemma);
578 //                if (!matcher.matches()) {
579 //                    LOGGER.error("{} does not match", lemma);
580 //                    continue;
581 //                }
582 
583 //                lemma = matcher.group(1);
584 //                lemma = getLemmaFromPredicateName(lemma);
585 //                String pos = matcher.group(2);
586 
587                     if (lus.get(pos) == null) {
588                         lus.put(pos, HashMultimap.create());
589                     }
590 
591                     lus.get(pos).put(lemma, frame);
592                 }
593             }
594 
595 
596             // FrameBase
597 
598             LOGGER.info("Load FrameBase file");
599             HashMultimap<String, String> fbFramenetToWordNet = HashMultimap.create();
600 
601             List<String> lines = Files.readLines(framebaseFile, Charsets.UTF_8);
602             for (String line : lines) {
603                 line = line.trim();
604                 if (line.length() == 0) {
605                     continue;
606                 }
607 
608                 Matcher matcher = FRAMEBASE_PATTERN.matcher(line);
609                 if (!matcher.matches()) {
610                     continue;
611                 }
612 
613                 String frame = matcher.group(1).toLowerCase();
614                 String lemma = matcher.group(2);
615                 lemma = getLemmaFromPredicateName(lemma);
616                 String wnSynset = WordNet.getSynsetID(Long.parseLong(matcher.group(4)), matcher.group(3));
617 
618                 String key = getFrameBaseKey(frame, lemma, ignoreLemmaInFrameBaseMappings);
619 
620                 fbFramenetToWordNet.put(key, wnSynset);
621             }
622 
623 //            for (String key : fbFramenetToWordNet.keySet()) {
624 //                System.out.println(key + " -> " + fbFramenetToWordNet.get(key));
625 //            }
626 
627 
            // PropBank
            //
            // Walks pbFolder and builds one RolesetInfo for every roleset declared in a verb frame file.
            // The resulting list (rolesets) drives both the predicate and the role alignment loops below.

            LOGGER.info("Reading PropBank files");
            List<RolesetInfo> rolesets = new ArrayList<>();
            // roleset ID -> predicate lemma, with '+' replaced by '_'
            Map<String, String> predicateToLemma = new HashMap<>();

            // Warning: this collects only verbs!
            for (File file : Files.fileTreeTraverser().preOrderTraversal(pbFolder)) {

                // Directories are visited by the traversal too: keep regular files only
                if (!file.isFile()) {
                    continue;
                }

                if (!file.getName().endsWith(".xml")) {
                    continue;
                }

                //todo: check ontonotes or not
                // The file name must match ONTONOTES_FILENAME_PATTERN: group 1 is taken as the base lemma,
                // group 2 as the type. Any other XML file aborts the whole run.
                String type;
                String baseLemma;
                Matcher matcher = ONTONOTES_FILENAME_PATTERN.matcher(file.getName());
                if (matcher.matches()) {
                    type = matcher.group(2);
                    baseLemma = matcher.group(1);
                } else {
                    throw new Exception(
                            "File " + file.getName() + " does not appear to be a good OntoNotes frame file");
                }

                // Only files whose type is "v" are processed
                if (!type.equals("v")) {
                    continue;
                }

                LOGGER.debug(file.getName());

                // Sense information for this file; later indexed by roleset ID and then by "wn" / "fn"
                HashMap<String, HashMap<String, Set>> senses = getSenses(file.getName(), ontonotesFolder, baseLemma,
                        type, onUnmarshaller);

                // NOTE(review): the PropBank frameset is read through fnUnmarshaller -- presumably its JAXB
                // context also covers the propbank classes; confirm against its construction.
                Frameset frameset = (Frameset) fnUnmarshaller.unmarshal(file);
                List<Object> noteOrPredicate = frameset.getNoteOrPredicate();

                for (Object predicate : noteOrPredicate) {
                    if (predicate instanceof Predicate) {

                        String lemma = getLemmaFromPredicateName(((Predicate) predicate).getLemma());

                        // WordNet is queried with spaces instead of '+' in multi-word lemmas
                        List<String> synsets = WordNet.getSynsetsForLemma(lemma.replace('+', ' '), type);

                        // NOTE(review): retainAll narrows the set returned by lus in place, so lus itself is
                        // modified for later lookups of the same lemma -- presumably intended. This also
                        // assumes lus has an entry for every (type, lemma); a null here would throw.
                        Set<String> luFrames = lus.get(type).get(lemma);
                        luFrames.retainAll(existingFrames);

                        List<Object> noteOrRoleset = ((Predicate) predicate).getNoteOrRoleset();
                        for (Object roleset : noteOrRoleset) {
                            if (roleset instanceof Roleset) {
                                String rolesetID = ((Roleset) roleset).getId();
                                predicateToLemma.put(rolesetID, lemma.replace('+', '_'));

                                // All rolesets of the same predicate share the same senses, luFrames and
                                // synsets objects
                                RolesetInfo rolesetInfo = new RolesetInfo(file, rolesetID, baseLemma, lemma, type,
                                        senses, luFrames, (Roleset) roleset, synsets);
                                rolesets.add(rolesetInfo);
                            }
                        }
                    }
                }
            }
693 
694 
            // Main loop
            //
            // State shared by the alignment steps below: statistics counters, the output mapping tables
            // (one table per OutputMapping value) and the serialized files used as a cache.

            int trivialCount = 0;       // logged as "PropBank trivial"
            int nonTrivialCount = 0;    // logged as "PropBank non-trivial"
            int nbCount = 0;            // logged as "NomBank (linked)"
            int emptyRelatedCount = 0;  // logged as "No WordNet relations"
            int nbGreaterCount = 0;     // logged as "More than one frame"
            int nbZeroCount = 0;        // logged as "Zero frames"
            int unlinkedCount = 0;      // logged as "NomBank (unlinked)"
            int roleMappingCount = 0;   // logged as "PropBank roles (with SemLink)"
            int nbRoleMappingCount = 0; // logged as "NomBank roles (with SemLink)"
            int noFrameBaseCount = 0;   // logged as "PropBank non-FrameBase"
            int semlinkCounter = 0;     // logged as "PropBank (only with SemLink)"

            // Each table maps a predicate (or role) identifier to the aligned frame (or frame role)
            HashMap<OutputMapping, HashMap<String, String>> outputMappingsForPredicates = new HashMap<>();
            HashMap<OutputMapping, HashMap<String, String>> outputMappingsForPredicatesAdd = new HashMap<>();
            HashMap<OutputMapping, HashMap<String, String>> outputMappingsForRoles = new HashMap<>();
            for (OutputMapping outputMapping : OutputMapping.values()) {
                outputMappingsForPredicates.put(outputMapping, new HashMap<>());
                outputMappingsForPredicatesAdd.put(outputMapping, new HashMap<>());
                outputMappingsForRoles.put(outputMapping, new HashMap<>());
            }

            // Serialized copies of the three tables; when all of them exist, the tables are loaded from
            // these files and the alignment is not recomputed
            File frameFile = new File(outputPattern + "-frames.ser");
            File rolesFile = new File(outputPattern + "-roles.ser");
            File addFile = new File(outputPattern + "-add.ser");
721 
722             if (frameFile.exists() && rolesFile.exists() && addFile.exists()) {
723                 LOGGER.info("Loading mappings from files");
724                 outputMappingsForPredicates = (HashMap<OutputMapping, HashMap<String, String>>) loadObjectFromFile(
725                         frameFile);
726                 outputMappingsForRoles = (HashMap<OutputMapping, HashMap<String, String>>) loadObjectFromFile(
727                         rolesFile);
728                 outputMappingsForPredicatesAdd = (HashMap<OutputMapping, HashMap<String, String>>) loadObjectFromFile(
729                         addFile);
730             } else {
                // Predicate alignment: for every PropBank roleset, collect the candidate FrameNet frames
                // and WordNet synsets coming from the different resources, and record an alignment only
                // when exactly one frame survives. The NomBank rolesets linked to the PropBank roleset
                // are then aligned in the same way.
                for (RolesetInfo rolesetInfo : rolesets) {

                    Roleset roleset = rolesetInfo.getRoleset();
                    String rolesetID = rolesetInfo.getLabel();
                    HashMap<String, HashMap<String, Set>> senses = rolesetInfo.getSenses();
                    List<String> synsets = rolesetInfo.getSynsets();
                    String lemma = rolesetInfo.getLemma();
                    String baseLemma = rolesetInfo.getBaseLemma();
                    Set<String> luFrames = rolesetInfo.getLuFrames();
                    String type = rolesetInfo.getType();

                    // Frames declared on the roleset itself (whitespace-separated, compared lowercase)
                    String frameNet = roleset.getFramnet();

                    if (frameNet != null) {
                        frameNet = frameNet.toLowerCase();
                    }

                    LOGGER.debug(rolesetID);

                    ArrayList<String> fnFrames = new ArrayList<>();
                    if (frameNet != null) {
                        String[] fns = frameNet.split("\\s+");
                        for (String fn : fns) {
                            if (fn.length() == 0) {
                                continue;
                            }
                            fnFrames.add(fn);
                        }
                    }
                    fnFrames.retainAll(existingFrames);

                    // Fallback: when the roleset declares no usable frame, look the frames up through its
                    // VerbNet classes.
                    // NOTE(review): every class with a mapping replaces fnFrames, so only the last such
                    // class is kept -- confirm that a union over all classes was not intended.
                    if (enableSemLinkForPredicates && fnFrames.size() == 0) {
                        String vnClasses = roleset.getVncls();
                        if (vnClasses != null) {
                            vnClasses = vnClasses.trim();
                            String[] parts = vnClasses.split("\\s+");
                            for (String part : parts) {
                                Set<String> frames = verbnetToFramenet.get(part);
                                if (frames != null) {
                                    fnFrames = new ArrayList<>(frames);
                                }
                            }
                        }
                    }

                    // Synsets and frames attached to this roleset in the senses map.
                    // NOTE(review): if the entry exists but lacks the "wn" or "fn" key, these become null
                    // and the retainAll below throws -- confirm getSenses always fills both keys. Also,
                    // retainAll narrows the set stored inside senses, not a copy.
                    Collection<String> wnFromSenses = new HashSet<>();
                    Collection<String> fnFromSenses = new HashSet<>();
                    if (senses.get(rolesetID) != null) {
                        wnFromSenses = senses.get(rolesetID).get("wn");
                        fnFromSenses = senses.get(rolesetID).get("fn");
                    }
                    fnFromSenses.retainAll(existingFrames);

//                                    System.out.println(synsets);
//                                    System.out.println(wnFromSenses);

                    Collection<String> wnCandidates = getIntersection(synsets, wnFromSenses);

                    // Decide whether the file-level base lemma should replace the predicate lemma: this
                    // happens when no synset information is available at all, or when a candidate synset
                    // lists the base lemma among its lemmas -- unless the predicate lemma has lexical
                    // units of its own.
                    boolean useBaseLemma = false;
                    String lemmaToUse = lemma;

                    if (!lemma.equals(baseLemma)) {
                        if (synsets.size() + wnFromSenses.size() == 0) {
                            useBaseLemma = true;
                        }
                        for (String wnCandidate : wnCandidates) {
                            Set<String> lemmas = WordNet.getLemmas(wnCandidate);
                            if (lemmas.contains(baseLemma)) {
                                useBaseLemma = true;
                            }
                        }

                        if (useBaseLemma && luFrames.size() != 0) {
                            LOGGER.debug("Base lemma should be used, but lexical unit found ({})",
                                    rolesetID);
                            useBaseLemma = false;
                        }
                    }

                    Set<String> luFramesToUse = new HashSet<>(luFrames);

                    if (useBaseLemma) {
                        LOGGER.debug("Using base lemma");
                        lemmaToUse = baseLemma;
                        // NOTE(review): unlike luFrames, this set is taken straight from lus and is not
                        // explicitly restricted to existingFrames here -- confirm.
                        luFramesToUse = lus.get(type).get(baseLemma);

                        List<String> newSynsets = WordNet
                                .getSynsetsForLemma(baseLemma.replace('+', ' '), type);
                        wnCandidates = getIntersection(wnCandidates, newSynsets);
                    }

                    Collection<String> fnCandidates = getIntersection(fnFrames, luFramesToUse,
                            fnFromSenses);

                    // Same intersection without the lexical units; used only for the SemLink statistic
                    // and for the additional mappings below
                    Collection<String> fnCandidatesOnlySemLink = getIntersection(fnFrames,
                            fnFromSenses);
                    if (fnCandidatesOnlySemLink.size() == 1) {
                        semlinkCounter++;
                    }

                    Collection<String> okFrames = getCandidateFrames(wnCandidates, fnCandidates,
                            lemmaToUse,
                            type, fbFramenetToWordNet, ignoreLemmaInFrameBaseMappings);

//                                    if (rolesetID.equals("add.04")) {
//                                        System.out.println(synsets);
//                                        System.out.println(wnFromSenses);
//
//                                        System.out.println(fnFrames);
//                                        System.out.println(luFramesToUse);
//                                        System.out.println(fnFromSenses);
//
//                                        System.out.println(wnCandidates);
//                                        System.out.println(fnCandidates);
//
//                                        System.out.println(lemmaToUse);
//                                    }

                    // Additional mappings: a single frame was found without lexical units, but
                    // getCandidateFrames returned nothing.
                    // NOTE(review): the condition tests fnCandidatesOnlySemLink while the loop walks
                    // fnCandidates; each element overwrites the same rolesetID key and bumps the counter
                    // -- confirm which collection was meant.
                    if (fnCandidatesOnlySemLink.size() == 1 && okFrames.size() == 0) {
                        for (String fnCandidate : fnCandidates) {
                            outputMappingsForPredicatesAdd.get(OutputMapping.PBauto)
                                    .put(rolesetID, fnCandidate);
                            noFrameBaseCount++;
                        }
                    }

                    // If Fp’ contains a singleton frame f, then we align p to f.
                    // Otherwise we avoid any alignment.
                    // The alignment is "trivial" when f is also the only frame in fnFrames; trivial and
                    // non-trivial alignments are stored in different tables (PBtrivial / PBauto).
                    if (okFrames.size() == 1) {
                        for (String okFrame : okFrames) {
                            if (fnFrames.size() == 1 && fnFrames.contains(okFrame)) {
                                trivialCount++;
                                outputMappingsForPredicates.get(OutputMapping.PBtrivial)
                                        .put(rolesetID, okFrame);
                                continue;
                            }
                            nonTrivialCount++;

                            outputMappingsForPredicates.get(OutputMapping.PBauto)
                                    .put(rolesetID, okFrame);
                        }
                    }

                    // NomBank
                    // NOTE(review): iterated without a null check, so nbFrames.get is assumed to return
                    // an empty set (not null) for rolesets without linked nouns -- confirm.
                    Set<Roleset> fRolesets = nbFrames.get(rolesetID);
                    for (Roleset nbRoleset : fRolesets) {

                        // See bad choice above
                        // (the roleset name is used as the noun lemma)
                        String nbLemma = nbRoleset.getName();

                        List<String> nbSynsets = WordNet
                                .getSynsetsForLemma(nbLemma.replace('+', ' '), "n");

                        // Synsets reachable from the verb candidates through the listed WordNet pointers
                        Set<String> relatedSynsets = new HashSet<>();
                        for (String wnCandidate : wnCandidates) {
                            relatedSynsets
                                    .addAll(WordNet.getGenericSet(wnCandidate, PointerType.DERIVED,
                                            PointerType.NOMINALIZATION, PointerType.PARTICIPLE_OF,
                                            PointerType.PERTAINYM));
                        }

                        if (relatedSynsets.size() == 0) {
                            emptyRelatedCount++;
                        }

                        // NOTE(review): this intersection uses luFrames even when the base lemma was
                        // selected above (luFramesToUse) -- confirm this is intended.
                        Set<String> luNbFrames = lus.get("n").get(nbLemma);
                        Collection<String> fnNbCandidates = getIntersection(fnFrames, luFrames,
                                fnFromSenses, luNbFrames);

                        Collection<String> nbCandidates = getIntersection(nbSynsets, relatedSynsets);
                        Collection<String> okNbFrames = getCandidateFrames(nbCandidates, fnNbCandidates,
                                nbLemma, "n", fbFramenetToWordNet, ignoreLemmaInFrameBaseMappings);

                        // If Fp’ contains a singleton frame f, then we align p to f.
                        // Otherwise we avoid any alignment.
                        if (okNbFrames.size() == 1) {
                            for (String okFrame : okNbFrames) {
                                nbCount++;
                                outputMappingsForPredicates.get(OutputMapping.NBauto)
                                        .put(nbRoleset.getId(), okFrame);
                            }
                        }
                        if (okNbFrames.size() > 1) {
                            nbGreaterCount++;
                        }
                        if (okNbFrames.size() == 0) {
                            nbZeroCount++;
                        }
                    }
                }
921 
922                 // Unlinked NomBank
923                 for (Roleset nbRoleset : nbUnlinked) {
924 
925                     // See bad choice above
926                     String nbLemma = nbRoleset.getName();
927                     List<String> nbSynsets = WordNet.getSynsetsForLemma(nbLemma.replace('+', ' '), "n");
928 
929                     if (nbSynsets.size() == 1) {
930                         Set<String> frames = lus.get("n").get(nbLemma);
931                         if (frames != null && frames.size() == 1) {
932                             for (String frame : frames) {
933                                 outputMappingsForPredicates.get(OutputMapping.NBresource).put(nbRoleset.getId(), frame);
934                             }
935 
936                             unlinkedCount++;
937                         }
938                     } else {
939                         //todo: check senses
940                     }
941                 }
942 
943                 // Looping for roles
944                 for (RolesetInfo rolesetInfo : rolesets) {
945                     Roleset roleset = rolesetInfo.getRoleset();
946                     String rolesetID = rolesetInfo.getLabel();
947 
948                     for (Object roles : roleset.getNoteOrRolesOrExample()) {
949                         if (!(roles instanceof Roles)) {
950                             continue;
951                         }
952 
953                         for (Object role : ((Roles) roles).getNoteOrRole()) {
954                             if (!(role instanceof eu.fbk.dkm.pikes.resources.util.propbank.Role)) {
955                                 continue;
956                             }
957 
958                             String n = ((eu.fbk.dkm.pikes.resources.util.propbank.Role) role).getN();
959                             String roleStr = rolesetID + "@" + n;
960 
961                             HashSet<String> tempMappingsForRole = new HashSet<>();
962 
963                             for (Vnrole vnrole : ((eu.fbk.dkm.pikes.resources.util.propbank.Role) role)
964                                     .getVnrole()) {
965                                 String vnClassRole = vnrole.getVncls().toLowerCase();
966                                 String vnThetaRole =
967                                         vnClassRole + "@" + vnrole.getVntheta().toLowerCase();
968 
969                                 Set<String> fnFrames = verbnetToFramenet
970                                         .get(vnThetaRole);
971                                 tempMappingsForRole.addAll(fnFrames);
972                             }
973 
974                             if (tempMappingsForRole.size() == 1) {
975                                 for (String frameRole : tempMappingsForRole) {
976 
977                                     String frameName = frameRole.replaceAll("@.*", "");
978                                     String goodCandidate;
979 
980                                     // Check for inconsistencies
981                                     goodCandidate = outputMappingsForPredicates.get(OutputMapping.PBauto)
982                                             .get(rolesetID);
983                                     if (goodCandidate == null || !goodCandidate.equals(frameName)) {
984                                         continue;
985                                     }
986 
987                                     outputMappingsForRoles.get(OutputMapping.PBauto).put(roleStr, frameRole);
988                                     roleMappingCount++;
989 
990                                     // NomBank
991                                     Set<Roleset> fRolesets = nbFrames.get(rolesetID);
992                                     for (Roleset nbRoleset : fRolesets) {
993 
994                                         String nbRolesetID = nbRoleset.getId();
995 
996                                         boolean isGoodCandidate = false;
997                                         goodCandidate = outputMappingsForPredicates.get(OutputMapping.NBauto)
998                                                 .get(nbRolesetID);
999                                         if (goodCandidate == null || !goodCandidate.equals(frameName)) {
1000                                             isGoodCandidate = true;
1001                                         }
1002                                         goodCandidate = outputMappingsForPredicates.get(OutputMapping.NBresource)
1003                                                 .get(nbRolesetID);
1004                                         if (goodCandidate == null || !goodCandidate.equals(frameName)) {
1005                                             isGoodCandidate = true;
1006                                         }
1007 
1008                                         if (!isGoodCandidate) {
1009                                             continue;
1010                                         }
1011 
1012                                         String correctN = n;
1013                                         HashMap<String, String> mappings = nomBankToProbBankRoles.get(nbRolesetID);
1014                                         if (mappings != null) {
1015                                             if (mappings.get(n) != null) {
1016                                                 correctN = mappings.get(n);
1017                                                 LOGGER.debug("Editing role...");
1018                                             }
1019                                         }
1020 
1021                                         String nbRoleStr = nbRolesetID + "@" + correctN;
1022 
1023                                         outputMappingsForRoles.get(OutputMapping.NBauto).put(nbRoleStr, frameRole);
1024                                         nbRoleMappingCount++;
1025                                     }
1026                                 }
1027                             }
1028                         }
1029                     }
1030                 }
1031 
1032                 LOGGER.info("*** STATISTICS ***");
1033 
1034                 LOGGER.info("PropBank trivial: {}", trivialCount);
1035                 LOGGER.info("PropBank non-trivial: {}", nonTrivialCount);
1036                 LOGGER.info("PropBank non-FrameBase: {}", noFrameBaseCount);
1037 
1038                 LOGGER.info("NomBank (linked): {}", nbCount);
1039                 LOGGER.info("NomBank (unlinked): {}", unlinkedCount);
1040                 LOGGER.info("NomBank (total): {}", unlinkedCount + nbCount);
1041 
1042                 LOGGER.info("PropBank (only with SemLink): {}", semlinkCounter);
1043 
1044                 LOGGER.info("PropBank roles (with SemLink): {}", roleMappingCount);
1045                 LOGGER.info("NomBank roles (with SemLink): {}", nbRoleMappingCount);
1046 
1047                 LOGGER.info("No WordNet relations: {}", emptyRelatedCount);
1048                 LOGGER.info("More than one frame: {}", nbGreaterCount);
1049                 LOGGER.info("Zero frames: {}", nbZeroCount);
1050             }
1051 
1052 
1053             // Parsing examples (exampleSentences)
1054 
1055             LOGGER.info("Parsing examples");
1056 
1057             HashMap<String, FrequencyHashSet<String>> rolesCountByType = new HashMap<>();
1058             FrequencyHashSet<String> rolesCount = new FrequencyHashSet<>();
1059             int usedSentences = 0;
1060 
1061             for (Sentence sentence : exampleSentences) {
1062                 HashMap<Word, HashMap<String, Srl>> srlIndex = new HashMap<>();
1063 
1064                 for (Srl srl : sentence.getSrls()) {
1065                     Word target = srl.getTarget().get(0);
1066 
1067                     // Only verbs and nouns
1068                     if (!target.getPos().toLowerCase().startsWith("v") && !target.getPos().toLowerCase()
1069                             .startsWith("n")) {
1070                         continue;
1071                     }
1072 
1073                     if (!srlIndex.containsKey(target)) {
1074                         srlIndex.put(target, new HashMap<>());
1075                     }
1076                     srlIndex.get(target).put(srl.getSource(), srl);
1077                 }
1078 
1079                 for (Word word : srlIndex.keySet()) {
1080                     if (srlIndex.get(word).size() > 1) {
1081 
1082                         usedSentences++;
1083 
1084                         Srl srlFrameNet = srlIndex.get(word).get("framenet");
1085                         Srl srlMate = srlIndex.get(word).get("mate");
1086 
1087                         String framenet = srlFrameNet.getLabel();
1088                         String mate = srlMate.getLabel();
1089 
1090                         boolean isVerb = true;
1091                         if (word.getPos().toLowerCase().startsWith("n")) {
1092                             isVerb = false;
1093                         }
1094 
1095                         boolean mappingExists = false;
1096                         String frameGuess;
1097 
1098                         if (isVerb) {
1099                             frameGuess = outputMappingsForPredicates.get(OutputMapping.PBauto).get(mate);
1100                             if (frameGuess != null && frameGuess.equals(framenet)) {
1101                                 mappingExists = true;
1102                             }
1103                             frameGuess = outputMappingsForPredicates.get(OutputMapping.PBtrivial).get(mate);
1104                             if (frameGuess != null && frameGuess.equals(framenet)) {
1105                                 mappingExists = true;
1106                             }
1107                         } else {
1108                             frameGuess = outputMappingsForPredicates.get(OutputMapping.NBauto).get(mate);
1109                             if (frameGuess != null && frameGuess.equals(framenet)) {
1110                                 mappingExists = true;
1111                             }
1112                             frameGuess = outputMappingsForPredicates.get(OutputMapping.NBresource).get(mate);
1113                             if (frameGuess != null && frameGuess.equals(framenet)) {
1114                                 mappingExists = true;
1115                             }
1116                         }
1117 
1118                         if (mappingExists) {
1119 
1120                             HashMap<Word, String> roleWordsMate = new HashMap<>();
1121                             HashMap<Word, String> roleWordsFrameNet = new HashMap<>();
1122 
1123                             // Mate
1124                             for (eu.fbk.fcw.utils.corpus.Role role : srlMate.getRoles()) {
1125                                 Word roleHead = role.getSpan().get(0);
1126                                 String roleLabel = role.getLabel();
1127                                 roleLabel = roleLabel.replaceAll("R-", "");
1128 
1129                                 // Consider only core roles
1130                                 if (roleLabel.startsWith("AM-")) {
1131                                     continue;
1132                                 }
1133 
1134                                 roleWordsMate.put(roleHead, roleLabel);
1135                             }
1136 
1137                             // FrameNet
1138                             for (eu.fbk.fcw.utils.corpus.Role role : srlFrameNet.getRoles()) {
1139                                 Word roleHead = role.getSpan().get(0);
1140                                 String roleLabel = role.getLabel();
1141                                 roleWordsFrameNet.put(roleHead, roleLabel);
1142                             }
1143 
1144                             for (Word key : roleWordsMate.keySet()) {
1145                                 String prefix = isVerb ? "v-" : "n-";
1146                                 String mateCompressed =
1147                                         prefix + mate + "@" + roleWordsMate.get(key).replaceAll("[aA]", "");
1148                                 rolesCount.add(mateCompressed);
1149 
1150                                 if (!rolesCountByType.containsKey(mateCompressed)) {
1151                                     rolesCountByType.put(mateCompressed, new FrequencyHashSet<>());
1152                                 }
1153 
1154                                 String fnRole = roleWordsFrameNet.get(key);
1155                                 if (fnRole != null) {
1156                                     fnRole = fnRole.toLowerCase();
1157                                     String fnCompressed = framenet + "@" + fnRole;
1158                                     rolesCountByType.get(mateCompressed).add(fnCompressed);
1159                                 } else {
1160                                     rolesCountByType.get(mateCompressed).add("[none]");
1161                                 }
1162                             }
1163 
1164 //                            for (eu.fbk.fcw.utils.corpus.Role role : srlFrameNet.getRoles()) {
1165 //                                Word roleHead = role.getSpan().get(0);
1166 //                                if (roleWords.containsKey(roleHead)) {
1167 //                                    String thisMateRole = mate + "@" + roleWords.get(roleHead);
1168 //                                    String thisFrameNetRole = framenet + "@" + role.getLabel();
1169 //
1170 //                                    if (!pbToFn.containsKey(thisMateRole)) {
1171 //                                        pbToFn.put(thisMateRole, new FrequencyHashSet<>());
1172 //                                    }
1173 //                                    pbToFn.get(thisMateRole).add(thisFrameNetRole);
1174 //                                }
1175 //                            }
1176                         } else {
1177                             // These *can* be good mappings
1178                         }
1179 
1180 //                        if (!fnToPb.containsKey(framenet)) {
1181 //                            fnToPb.put(framenet, new FrequencyHashSet<>());
1182 //                        }
1183 //                        if (!pbToFn.containsKey(mate)) {
1184 //                            pbToFn.put(mate, new FrequencyHashSet<>());
1185 //                        }
1186 //
1187 //                        fnToPb.get(framenet).add(mate);
1188 //                        pbToFn.get(mate).add(framenet);
1189                     }
1190                 }
1191             }
1192 
1193             LOGGER.info("Used sentences: {}", usedSentences);
1194 
1195 
1196             // Evaluate and save
1197 
1198             double okThreshold = 0.5;
1199             int okMinFreq = 2;
1200             HashMap<OutputMapping, HashMap<String, String>> outputMappingsForRolesFromExamples = new HashMap<>();
1201             for (OutputMapping outputMapping : OutputMapping.values()) {
1202                 outputMappingsForRolesFromExamples.put(outputMapping, new HashMap<>());
1203             }
1204 
1205             for (double threshold = 0.5; threshold < 1; threshold += 0.1) {
1206                 for (int minFreq = 2; minFreq <= 10; minFreq++) {
1207                     int trivialMappingsCount = 0;
1208                     int correctMappingsCount = 0;
1209                     int wrongMappingsCount = 0;
1210 
1211                     HashMap<OutputMapping, HashMap<String, String>> outputMappingsForRolesFromExamplesTemp = new HashMap<>();
1212                     for (OutputMapping outputMapping : OutputMapping.values()) {
1213                         outputMappingsForRolesFromExamplesTemp.put(outputMapping, new HashMap<>());
1214                     }
1215 
1216                     for (String key : rolesCount.keySet()) {
1217 
1218                         String candidate = rolesCountByType.get(key).mostFrequent();
1219                         int freq = rolesCountByType.get(key).get(candidate);
1220                         double ratio = 0.0;
1221                         if (candidate != null && !candidate.equals("[none]")) {
1222                             ratio = (double) freq / (double) rolesCount.get(key);
1223                         } else {
1224                             candidate = null;
1225                         }
1226                         if (ratio > threshold && freq >= minFreq) {
1227                             String mate = key.substring(2);
1228                             OutputMapping mapping = key.startsWith("v") ? OutputMapping.PBauto : OutputMapping.NBauto;
1229                             outputMappingsForRolesFromExamplesTemp.get(mapping).put(mate, candidate);
1230 
1231                             // Save one version
1232                             if (Math.abs(okThreshold - threshold) < 0.01 && minFreq == okMinFreq) {
1233                                 outputMappingsForRolesFromExamples.get(mapping).put(mate, candidate);
1234                             }
1235 
1236                             String fnRole;
1237 
1238                             switch (mapping) {
1239                             case PBauto:
1240                                 fnRole = outputMappingsForRoles.get(OutputMapping.PBauto).get(mate);
1241                                 if (fnRole != null) {
1242                                     trivialMappingsCount++;
1243                                     if (fnRole.equals(candidate)) {
1244                                         correctMappingsCount++;
1245                                     } else {
1246                                         wrongMappingsCount++;
1247                                     }
1248                                 }
1249                                 break;
1250                             case NBauto:
1251                                 fnRole = outputMappingsForRoles.get(OutputMapping.NBauto).get(mate);
1252                                 if (fnRole != null) {
1253                                     trivialMappingsCount++;
1254                                     if (fnRole.equals(candidate)) {
1255                                         correctMappingsCount++;
1256                                     } else {
1257                                         wrongMappingsCount++;
1258                                     }
1259                                 }
1260                                 break;
1261                             }
1262                         }
1263                     }
1264 
1265 //                    LOGGER.info("Trivial role mappings: {}", trivialMappingsCount);
1266 //                    LOGGER.info("Correct role mappings: {}", correctMappingsCount);
1267 //                    LOGGER.info("Wrong role mappings: {}", wrongMappingsCount);
1268 
1269                     if (printPRTable) {
1270                         int tp = correctMappingsCount;
1271                         int fp = wrongMappingsCount;
1272                         int fn = outputMappingsForRoles.get(OutputMapping.PBauto).size() - trivialMappingsCount + fp;
1273                         double precision = (double) tp / (double) (tp + fp);
1274                         double recall = (double) tp / (double) (tp + fn);
1275                         double f1 = 2 * (precision * recall) / (precision + recall);
1276                         System.out.println(String.format(
1277                                         "%5f %5d %5d %5d %5d %5d %5d %5f %5f %5f",
1278                                         threshold,
1279                                         minFreq,
1280                                         outputMappingsForRolesFromExamplesTemp.get(OutputMapping.PBauto).size(),
1281                                         outputMappingsForRolesFromExamplesTemp.get(OutputMapping.NBauto).size(),
1282                                         trivialMappingsCount,
1283                                         correctMappingsCount,
1284                                         wrongMappingsCount,
1285                                         precision,
1286                                         recall,
1287                                         f1
1288                                 )
1289                         );
1290                     }
1291                 }
1292             }
1293 
1294 
1295             // Write files
1296 
1297             outputFile = new File(outputPattern + "-frames.tsv");
1298             LOGGER.info("Writing output file {}", outputFile.getName());
1299             writer = new BufferedWriter(new FileWriter(outputFile));
1300             for (OutputMapping outputMapping : outputMappingsForPredicates.keySet()) {
1301                 for (String key : outputMappingsForPredicates.get(outputMapping).keySet()) {
1302                     String value = outputMappingsForPredicates.get(outputMapping).get(key);
1303 
1304                     IRI fbIRI = null;
1305                     FrameBase.POS pos = FrameBase.POS.VERB;
1306                     switch (outputMapping) {
1307                     case NBauto:
1308                     case NBresource:
1309                         pos = FrameBase.POS.NOUN;
1310                         break;
1311                     }
1312 
1313                     String lemma = predicateToLemma.get(key);
1314                     if (lemma != null) {
1315                         fbIRI = FrameBase.classFor(value, lemma, pos);
1316                     }
1317                     if (fbIRI == null) {
1318                         lemma = key.substring(0, key.length() - 3);
1319                         fbIRI = FrameBase.classFor(value, lemma, pos);
1320                     }
1321 
1322                     // Should never happen
1323                     if (fbIRI == null) {
1324                         LOGGER.error("This should never happen!");
1325                         LOGGER.debug(value);
1326                         LOGGER.debug(key);
1327                         LOGGER.debug(key.substring(0, key.length() - 3));
1328                         LOGGER.debug(lemma);
1329                         break;
1330                     }
1331 
1332                     writer.append(outputMapping.toString()).append('\t');
1333                     writer.append(key).append('\t');
1334                     writer.append(value).append('\n');
1335 
1336                     writerFrames.append(outputMapping.toString().substring(0, 2).toLowerCase()).append(':');
1337                     writerFrames.append(key).append('\t');
1338                     writerFrames.append(lemma).append('\t');
1339                     writerFrames.append(pos.getLetter()).append('\t');
1340                     writerFrames.append(fbIRI.toString()).append('\n');
1341                 }
1342             }
1343             writer.close();
1344             if (saveFiles) {
1345                 outputFile = new File(outputPattern + "-frames.ser");
1346                 saveObjectToFile(outputMappingsForPredicates, outputFile);
1347             }
1348 
1349             outputFile = new File(outputPattern + "-roles.tsv");
1350             LOGGER.info("Writing output file {}", outputFile.getName());
1351             writer = new BufferedWriter(new FileWriter(outputFile));
1352             for (OutputMapping outputMapping : outputMappingsForRoles.keySet()) {
1353                 for (String key : outputMappingsForRoles.get(outputMapping).keySet()) {
1354                     String value = outputMappingsForRoles.get(outputMapping).get(key);
1355 
1356                     String[] parts = value.split("@");
1357                     if (parts.length < 2) {
1358                         LOGGER.error("This is impossible!");
1359                         break;
1360                     }
1361                     IRI fbIRI = FrameBase.propertyFor(parts[0], parts[1]);
1362                     if (fbIRI == null) {
1363                         LOGGER.error("This should never happen!");
1364                         LOGGER.debug(key);
1365                         LOGGER.debug(value);
1366                         break;
1367                     }
1368 
1369                     writer.append(outputMapping.toString()).append('\t');
1370                     writer.append(key).append('\t');
1371                     writer.append(value).append('\t');
1372 
1373                     writerRoles.append(outputMapping.toString().substring(0, 2).toLowerCase()).append(':');
1374                     writerRoles.append(key).append('\t');
1375                     writerRoles.append(fbIRI.toString()).append('\n');
1376                 }
1377             }
1378             writer.close();
1379             if (saveFiles) {
1380                 outputFile = new File(outputPattern + "-roles.ser");
1381                 saveObjectToFile(outputMappingsForRoles, outputFile);
1382             }
1383 
1384             if (outputMappingsForRolesFromExamples != null) {
1385                 outputFile = new File(outputPattern + "-roles-examples.tsv");
1386                 LOGGER.info("Writing output file {}", outputFile.getName());
1387                 writer = new BufferedWriter(new FileWriter(outputFile));
1388                 for (OutputMapping outputMapping : outputMappingsForRolesFromExamples.keySet()) {
1389                     for (String key : outputMappingsForRolesFromExamples.get(outputMapping).keySet()) {
1390                         String value = outputMappingsForRolesFromExamples.get(outputMapping).get(key);
1391 
1392                         String[] parts = value.split("@");
1393                         if (parts.length < 2) {
1394                             LOGGER.error("This is impossible!");
1395                             break;
1396                         }
1397                         IRI fbIRI = FrameBase.propertyFor(parts[0], parts[1]);
1398                         if (fbIRI == null) {
1399                             LOGGER.error("This should never happen!");
1400                             LOGGER.debug(key);
1401                             LOGGER.debug(value);
1402                             break;
1403                         }
1404 
1405                         writer.append(outputMapping.toString()).append('\t');
1406                         writer.append(key).append('\t');
1407                         writer.append(value).append('\n');
1408 
1409                         writerRoles.append(outputMapping.toString().substring(0, 2).toLowerCase()).append(':');
1410                         writerRoles.append(key).append('\t');
1411                         writerRoles.append(fbIRI.toString()).append('\n');
1412                     }
1413                 }
1414                 writer.close();
1415             }
1416 
1417             outputFile = new File(outputPattern + "-add.tsv");
1418             LOGGER.info("Writing output file {}", outputFile.getName());
1419             writer = new BufferedWriter(new FileWriter(outputFile));
1420             for (OutputMapping outputMapping : outputMappingsForPredicatesAdd.keySet()) {
1421                 for (String key : outputMappingsForPredicatesAdd.get(outputMapping).keySet()) {
1422                     String value = outputMappingsForPredicatesAdd.get(outputMapping).get(key);
1423 
1424                     writer.append(outputMapping.toString()).append('\t');
1425                     writer.append(key).append('\t');
1426                     writer.append(value).append('\n');
1427                 }
1428             }
1429             writer.close();
1430             if (saveFiles) {
1431                 outputFile = new File(outputPattern + "-add.ser");
1432                 saveObjectToFile(outputMappingsForPredicatesAdd, outputFile);
1433             }
1434 
1435             if (saveFiles) {
1436                 outputFile = new File(outputPattern + "-lu-existingFrames.ser");
1437                 if (!outputFile.exists()) {
1438                     LOGGER.info("Writing object file {}", outputFile.getName());
1439                     saveObjectToFile(existingFrames, outputFile);
1440                 }
1441 
1442                 outputFile = new File(outputPattern + "-lu-lus.ser");
1443                 if (!outputFile.exists()) {
1444                     LOGGER.info("Writing object file {}", outputFile.getName());
1445                     saveObjectToFile(lus, outputFile);
1446                 }
1447 
1448                 outputFile = new File(outputPattern + "-lu-exampleSentences.ser");
1449                 if (!outputFile.exists()) {
1450                     LOGGER.info("Writing object file {}", outputFile.getName());
1451                     saveObjectToFile(exampleSentences, outputFile);
1452                 }
1453             }
1454 
1455             writerFrames.close();
1456             writerRoles.close();
1457 
1458         } catch (Exception e) {
1459             CommandLine.fail(e);
1460         }
1461     }
1462 
1463     private static Object loadObjectFromFile(File inputFile) throws IOException {
1464         ObjectInputStream objectinputstream = null;
1465         FileInputStream streamIn = null;
1466         try {
1467             streamIn = new FileInputStream(inputFile);
1468             objectinputstream = new ObjectInputStream(streamIn);
1469             return objectinputstream.readObject();
1470         } catch (Exception e) {
1471             e.printStackTrace();
1472         } finally {
1473             if (objectinputstream != null) {
1474                 objectinputstream.close();
1475             }
1476         }
1477         return null;
1478     }
1479 
1480     private static void saveObjectToFile(Object o, File outputFile) throws IOException {
1481         ObjectOutputStream oos = null;
1482         FileOutputStream fout = null;
1483         try {
1484             fout = new FileOutputStream(outputFile);
1485             oos = new ObjectOutputStream(fout);
1486             oos.writeObject(o);
1487         } catch (Exception e) {
1488             e.printStackTrace();
1489         } finally {
1490             if (oos != null) {
1491                 oos.close();
1492             }
1493         }
1494     }
1495 
1496     private static List<Integer> getSpan(String text, LabelType label) {
1497         List<Integer> ret = new ArrayList<>();
1498 
1499         Integer start = label.getStart();
1500         if (start == null) {
1501             return null;
1502         }
1503 
1504         Integer end = label.getEnd();
1505         String before = text.substring(0, start);
1506         before = before.replaceAll("\\s+", " ");
1507         int target = before.replaceAll("[^\\s]", "").length();
1508         String inside = text.substring(start, end);
1509         inside = inside.replaceAll("\\s+", " ");
1510         int length = inside.replaceAll("[^\\s]", "").length() + 1;
1511 
1512         for (int i = 0; i < length; i++) {
1513             ret.add(target + i);
1514         }
1515 
1516         return ret;
1517     }
1518 
1519     private static HashMap<String, HashMap<String, Set>> getSenses(String name, File ontonotesFolder, String fnLemma,
1520             String type, Unmarshaller onUnmarshaller)
1521             throws JAXBException {
1522 
1523         HashMap<String, HashMap<String, Set>> senses = new HashMap<>();
1524 
1525         //todo: add type (for PB 1.7, for example)
1526         File onSense = new File(ontonotesFolder.getAbsolutePath() + File.separator + name);
1527         if (onSense.exists()) {
1528 
1529             Inventory inventory = (Inventory) onUnmarshaller.unmarshal(onSense);
1530             for (Sense sense : inventory.getSense()) {
1531 
1532                 if (sense.getMappings() == null) {
1533                     continue;
1534                 }
1535 
1536                 Set<String> onWn = new HashSet<>();
1537                 Set<String> onFn = new HashSet<>();
1538                 Set<String> onPb = new HashSet<>();
1539 
1540                 // PropBank
1541                 if (sense.getMappings().getPb() != null) {
1542                     String[] pbs = sense.getMappings().getPb().split(",");
1543                     for (String pb : pbs) {
1544                         pb = pb.trim();
1545                         if (pb.length() == 0) {
1546                             continue;
1547                         }
1548                         onPb.add(pb);
1549                     }
1550                 }
1551 
1552                 // FrameNet
1553                 if (sense.getMappings().getFn() != null) {
1554                     String[] fns = sense.getMappings().getFn().split(",");
1555                     for (String fn : fns) {
1556                         fn = fn.trim().toLowerCase();
1557                         if (fn.length() == 0) {
1558                             continue;
1559                         }
1560                         onFn.add(fn);
1561                     }
1562                 }
1563 
1564                 // WordNet
1565                 try {
1566                     for (Wn wn : sense.getMappings().getWn()) {
1567                         String lemma = wn.getLemma();
1568                         if (lemma == null || lemma.length() == 0) {
1569                             lemma = fnLemma;
1570                         }
1571                         String value = wn.getvalue();
1572                         String[] ids = value.split(",");
1573                         for (String id : ids) {
1574                             id = id.trim();
1575                             if (id.length() == 0) {
1576                                 continue;
1577                             }
1578                             String synsetID = WordNet.getSynsetID(lemma + "-" + id + type);
1579                             onWn.add(synsetID);
1580                         }
1581                     }
1582                 } catch (Exception e) {
1583                     // ignored
1584                 }
1585 
1586                 for (String pb : onPb) {
1587                     if (!senses.containsKey(pb)) {
1588                         senses.put(pb, new HashMap<>());
1589                     }
1590                     if (!senses.get(pb).containsKey("wn")) {
1591                         senses.get(pb).put("wn", new HashSet<>());
1592                     }
1593                     if (!senses.get(pb).containsKey("fn")) {
1594                         senses.get(pb).put("fn", new HashSet<>());
1595                     }
1596                     senses.get(pb).get("wn").addAll(onWn);
1597                     senses.get(pb).get("fn").addAll(onFn);
1598                 }
1599             }
1600         }
1601 
1602         return senses;
1603     }
1604 
1605     private static Collection<String> getCandidateFrames(Collection<String> wnCandidates,
1606             Collection<String> fnCandidates,
1607             String lemma, String type, HashMultimap<String, String> fbFramenetToWordNet,
1608             boolean ignoreLemmaInFrameBaseMappings) {
1609 
1610         Collection<String> okFrames = new HashSet<>();
1611         for (String fnCandidate : fnCandidates) {
1612             String key = getFrameBaseKey(fnCandidate, lemma, type, ignoreLemmaInFrameBaseMappings);
1613             Collection<String> wnCandidatesForThisFrame = new HashSet<>(fbFramenetToWordNet.get(key));
1614             wnCandidatesForThisFrame.retainAll(wnCandidates);
1615             if (wnCandidatesForThisFrame.size() > 0) {
1616                 okFrames.add(fnCandidate);
1617             }
1618         }
1619 
1620         return okFrames;
1621     }
1622 
1623     private static String getFrameBaseKey(String frame, String lemma, String type,
1624             boolean ignoreLemmaInFrameBaseMappings) {
1625         return getFrameBaseKey(frame, lemma + "." + type, ignoreLemmaInFrameBaseMappings);
1626     }
1627 
1628     private static String getFrameBaseKey(String frame, String lemma, boolean ignoreLemmaInFrameBaseMappings) {
1629         if (ignoreLemmaInFrameBaseMappings) {
1630             return frame;
1631         }
1632         return frame + "-" + lemma;
1633     }
1634 }