1 package eu.fbk.dkm.pikes.resources.ecb;
2
import com.google.common.collect.HashMultimap;
import com.google.common.io.Files;
import eu.fbk.utils.core.CommandLine;
import ixa.kaflib.*;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
21
22
23
24
25 public class ECBmerger {
26
27 private static final Logger LOGGER = LoggerFactory.getLogger(ECBmerger.class);
28 private static final Pattern mentionPattern = Pattern.compile("<([^>]*)>");
29 private static final Pattern chainPattern = Pattern.compile("CHAIN=\"([0-9]+)\"");
30
31 public static void main(String[] args) {
32 try {
33 final CommandLine cmd = CommandLine
34 .parser()
35 .withName("./taol-extractor")
36 .withHeader("Add mentions to NAF folder for ECB resource")
37 .withOption("i", "input-txt", "Input TXT folder", "FOLDER",
38 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
39 .withOption("n", "input-naf", "Input NAF folder", "FOLDER",
40 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
41 .withOption("o", "output", "Output folder", "FOLDER",
42 CommandLine.Type.DIRECTORY, true, false, true)
43 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
44
45 File inputFolder = cmd.getOptionValue("input-txt", File.class);
46 File nafFolder = cmd.getOptionValue("input-naf", File.class);
47 File outputFolder = cmd.getOptionValue("output", File.class);
48
49 if (!outputFolder.exists()) {
50 outputFolder.mkdirs();
51 }
52
53 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
54 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
55
56
57
58
59 String tags;
60
61 int i = 0;
62 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
63 if (!file.isFile()) {
64 continue;
65 }
66 if (file.getName().startsWith(".")) {
67 continue;
68 }
69
70 String path = file.getParentFile().toString();
71 String folder = path.substring(path.lastIndexOf("/"));
72 String local_name = folder + File.separator + file.getName();
73
74 String naf = nafFolder + File.separator + folder + File.separator + file.getName()
75 .replace("ecb.txt", "naf.gz");
76
77 FileInputStream bais = new FileInputStream(naf);
78 GZIPInputStream gzis = new GZIPInputStream(bais);
79 InputStreamReader reader = new InputStreamReader(gzis);
80 BufferedReader in = new BufferedReader(reader);
81
82 KAFDocument nafDocument = KAFDocument.createFromStream(in);
83
84
85
86
87
88
89
90
91 Map<Integer, Term> termsHashMap = new HashMap<>();
92 for (Term term : nafDocument.getTerms()) {
93 termsHashMap.put(term.getOffset(), term);
94 }
95
96 HashMultimap<String, Integer> clusterOffsets = HashMultimap.create();
97
98 String content = FileUtils.readFileToString(file, "utf-8");
99
100 Matcher matcher = mentionPattern.matcher(content);
101 int offset = 0;
102 int lastEnd = 0;
103 while (matcher.find()) {
104 offset += matcher.start() - lastEnd;
105 if (!matcher.group().startsWith("</")) {
106 Matcher chainMatcher = chainPattern.matcher(matcher.group());
107 if (!chainMatcher.find()) {
108 LOGGER.error("No chain found!");
109 continue;
110 }
111
112 String chain = chainMatcher.group(1);
113 clusterOffsets.put(chain, offset);
114 }
115 lastEnd = matcher.end();
116 }
117
118 for (String clusterId : clusterOffsets.keySet()) {
119 Set<Integer> terms = clusterOffsets.get(clusterId);
120 List<Span<Term>> termsList = new ArrayList<>();
121 for (Integer termOffset : terms) {
122 Term term = termsHashMap.get(termOffset);
123 if (term == null) {
124 LOGGER.error("Term is null!");
125 continue;
126 }
127 Span<Term> termSpan = KAFDocument.newTermSpan();
128 termSpan.addTarget(term);
129 termsList.add(termSpan);
130 }
131 Coref coref = nafDocument.newCoref(termsList);
132 coref.setCluster(clusterId);
133 coref.setType("event-gold");
134 }
135
136 File outputFile = new File(
137 outputFolder.getAbsolutePath() + File.separator +
138 file.getAbsolutePath().substring(
139 inputFolder.getAbsolutePath().length()).replace(".ecb.txt", ".naf"));
140 Files.createParentDirs(outputFile);
141 nafDocument.save(outputFile);
142
143 }
144
145 } catch (Exception e) {
146 CommandLine.fail(e);
147 }
148
149 }
150
151 }