1 package eu.fbk.dkm.pikes.resources.ecb;
2
3 import com.google.common.io.Files;
4 import eu.fbk.utils.core.CommandLine;
5 import ixa.kaflib.Coref;
6 import ixa.kaflib.KAFDocument;
7 import ixa.kaflib.Span;
8 import ixa.kaflib.Term;
9 import org.slf4j.Logger;
10 import org.slf4j.LoggerFactory;
11
12 import java.io.BufferedWriter;
13 import java.io.File;
14 import java.io.FileWriter;
15 import java.util.regex.Pattern;
16
17
18
19
20 public class ECBextractor {
21
22 private static final Logger LOGGER = LoggerFactory.getLogger(ECBextractor.class);
23 private static final Pattern tokenPattern = Pattern.compile("/([0-9]+)/([0-9])\\.ecb#char=([0-9]+)");
24
25
26
27 public static void main(String[] args) {
28 try {
29
30 final CommandLine cmd = CommandLine
31 .parser()
32 .withName("./ecb-extractor")
33 .withHeader("Extracts URI of events in the gold standard")
34 .withOption("n", "input-naf", "Input NAF folder", "FOLDER",
35 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
36 .withOption("o", "output", "Output file", "FILE",
37 CommandLine.Type.FILE, true, false, true)
38 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
39
40 File inputNaf = cmd.getOptionValue("input-naf", File.class);
41 File outputFile = cmd.getOptionValue("output", File.class);
42
43 BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
44
45 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputNaf)) {
46 if (!file.isFile()) {
47 continue;
48 }
49 if (file.getName().startsWith(".")) {
50 continue;
51 }
52
53 String path = file.getParentFile().toString();
54 Integer folder = Integer.parseInt(path.substring(path.lastIndexOf("/")).substring(1));
55 Integer fileNum = Integer.parseInt(file.getName().substring(0, file.getName().length() - 4));
56
57 LOGGER.debug(file.getAbsolutePath());
58 KAFDocument document = KAFDocument.createFromFile(file);
59 String uri = document.getPublic().uri;
60
61 for (Coref coref : document.getCorefs()) {
62 if (coref.getType() == null) {
63 continue;
64 }
65 if (!coref.getType().equals("event-gold")) {
66 continue;
67 }
68
69 Integer cluster = Integer.parseInt(coref.getCluster());
70 String idCluster = folder + "_" + cluster;
71
72 for (Span<Term> termSpan : coref.getSpans()) {
73 Term term = termSpan.getTargets().get(0);
74
75 String thisURI =
76 uri + "#char=" + term.getOffset() + "," + (term.getOffset() + term.getLength());
77 writer.append(thisURI).append("\n");
78 }
79 }
80 }
81
82 writer.close();
83
84 } catch (Exception e) {
85 CommandLine.fail(e);
86 }
87 }
88
89 }