1   package eu.fbk.dkm.pikes.resources.ecb;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.utils.core.CommandLine;
5   import ixa.kaflib.Coref;
6   import ixa.kaflib.KAFDocument;
7   import ixa.kaflib.Span;
8   import ixa.kaflib.Term;
9   import org.slf4j.Logger;
10  import org.slf4j.LoggerFactory;
11  
12  import java.io.BufferedWriter;
13  import java.io.File;
14  import java.io.FileWriter;
15  import java.util.regex.Pattern;
16  
17  /**
18   * Created by marcorospocher on 12/03/16.
19   */
20  public class ECBextractor {
21  
22      private static final Logger LOGGER = LoggerFactory.getLogger(ECBextractor.class);
23      private static final Pattern tokenPattern = Pattern.compile("/([0-9]+)/([0-9])\\.ecb#char=([0-9]+)");
24  //    private static final Boolean removeAloneClusters = false;
25  //    private static final Pattern chainPattern = Pattern.compile("CHAIN=\"([0-9]+)\"");
26  
27      public static void main(String[] args) {
28          try {
29  
30              final CommandLine cmd = CommandLine
31                      .parser()
32                      .withName("./ecb-extractor")
33                      .withHeader("Extracts URI of events in the gold standard")
34                      .withOption("n", "input-naf", "Input NAF folder", "FOLDER",
35                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
36                      .withOption("o", "output", "Output file", "FILE",
37                              CommandLine.Type.FILE, true, false, true)
38                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
39  
40              File inputNaf = cmd.getOptionValue("input-naf", File.class);
41              File outputFile = cmd.getOptionValue("output", File.class);
42  
43              BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
44  
45              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputNaf)) {
46                  if (!file.isFile()) {
47                      continue;
48                  }
49                  if (file.getName().startsWith(".")) {
50                      continue;
51                  }
52  
53                  String path = file.getParentFile().toString();
54                  Integer folder = Integer.parseInt(path.substring(path.lastIndexOf("/")).substring(1));
55                  Integer fileNum = Integer.parseInt(file.getName().substring(0, file.getName().length() - 4));
56  
57                  LOGGER.debug(file.getAbsolutePath());
58                  KAFDocument document = KAFDocument.createFromFile(file);
59                  String uri = document.getPublic().uri;
60  
61                  for (Coref coref : document.getCorefs()) {
62                      if (coref.getType() == null) {
63                          continue;
64                      }
65                      if (!coref.getType().equals("event-gold")) {
66                          continue;
67                      }
68  
69                      Integer cluster = Integer.parseInt(coref.getCluster());
70                      String idCluster = folder + "_" + cluster;
71  
72                      for (Span<Term> termSpan : coref.getSpans()) {
73                          Term term = termSpan.getTargets().get(0);
74  
75                          String thisURI =
76                                  uri + "#char=" + term.getOffset() + "," + (term.getOffset() + term.getLength());
77                          writer.append(thisURI).append("\n");
78                      }
79                  }
80              }
81  
82              writer.close();
83  
84          } catch (Exception e) {
85              CommandLine.fail(e);
86          }
87      }
88  
89  }