1 package eu.fbk.dkm.pikes.resources.ecb;
2
3 import com.google.common.collect.HashMultimap;
4 import com.google.common.io.Files;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.Coref;
7 import ixa.kaflib.KAFDocument;
8 import ixa.kaflib.Span;
9 import ixa.kaflib.Term;
10 import org.apache.commons.csv.CSVFormat;
11 import org.apache.commons.csv.CSVRecord;
12 import org.slf4j.Logger;
13 import org.slf4j.LoggerFactory;
14
15 import java.io.*;
16 import java.util.HashMap;
17 import java.util.HashSet;
18 import java.util.List;
19 import java.util.Set;
20 import java.util.regex.Matcher;
21 import java.util.regex.Pattern;
22
23
24
25
26 public class ECBevaluator {
27
28 private static final Logger LOGGER = LoggerFactory.getLogger(ECBevaluator.class);
29 private static final Pattern tokenPattern = Pattern.compile("/([0-9]+)/([0-9])\\.ecb#char=([0-9]+)");
30
31
32 private static Integer FOLDER = null;
33
34 public static void printToken(Appendable writer, Term token, int i, String last) throws IOException {
35 writer.append(String.format("%d", i)).append("\t");
36 writer.append(token.getForm()).append("\t");
37 writer.append("_").append("\t");
38 writer.append(token.getForm()).append("\t");
39 writer.append("_").append("\t");
40 writer.append(token.getMorphofeat()).append("\t");
41 writer.append("_").append("\t");
42 writer.append("_").append("\t");
43 writer.append("_").append("\t");
44 writer.append("_").append("\t");
45 writer.append("_").append("\t");
46 writer.append("_").append("\t");
47 writer.append("_").append("\t");
48 writer.append("_").append("\t");
49 writer.append("_").append("\t");
50 writer.append("_").append("\t");
51 writer.append(last);
52 writer.append("\n");
53
54 }
55
56 public static void main(String[] args) {
57 try {
58
59 final CommandLine cmd = CommandLine
60 .parser()
61 .withName("./ecb-evaluator")
62 .withHeader("Evaluator event extractor")
63 .withOption("n", "input-naf", "Input NAF folder", "FOLDER",
64 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
65 .withOption("i", "input-csv", "Input CSV file", "FILE",
66 CommandLine.Type.FILE_EXISTING, true, false, true)
67 .withOption("g", "output-gold", "Output gold file", "FILE",
68 CommandLine.Type.FILE, true, false, true)
69 .withOption("b", "output-baseline", "Output baseline file", "FILE",
70 CommandLine.Type.FILE, true, false, true)
71 .withOption("o", "output", "Output file", "FILE",
72 CommandLine.Type.FILE, true, false, true)
73 .withOption("l", "input-lemmas", "Lemmas CSV file", "FILE",
74 CommandLine.Type.FILE_EXISTING, true, false, false)
75
76 .withOption("c", "check-gold", "Use only events annotated in gold standard")
77 .withOption("s", "add-single", "Add single clusters")
78 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
79
80 File inputCsv = cmd.getOptionValue("input-csv", File.class);
81 File inputNaf = cmd.getOptionValue("input-naf", File.class);
82 File inputLemmas = cmd.getOptionValue("input-lemmas", File.class);
83
84 File outputGold = cmd.getOptionValue("output-gold", File.class);
85 File outputBaseline = cmd.getOptionValue("output-baseline", File.class);
86 File output = cmd.getOptionValue("output", File.class);
87
88
89 Boolean checkGold = cmd.hasOption("check-gold");
90 Boolean addSingleClusters = cmd.hasOption("add-single");
91
92 Reader in;
93 Iterable<CSVRecord> records;
94
95 HashMap<String, Integer> lemmas = null;
96 int lemmaIndex = 0;
97 if (inputLemmas != null) {
98 lemmas = new HashMap<>();
99 in = new FileReader(inputLemmas);
100 records = CSVFormat.EXCEL.withHeader().parse(in);
101 for (CSVRecord record : records) {
102 String lemma = record.get(1);
103 lemma = lemma.replaceAll("\"", "").trim();
104 if (lemma.length() > 0) {
105 lemmas.put(lemma, ++lemmaIndex);
106 }
107 }
108 }
109
110 LOGGER.info("Lemmas: {}", lemmas.size());
111
112 BufferedWriter goldWriter = new BufferedWriter(new FileWriter(outputGold));
113 BufferedWriter baselineWriter = new BufferedWriter(new FileWriter(outputBaseline));
114 BufferedWriter writer = new BufferedWriter(new FileWriter(output));
115
116 HashMultimap<String, String> goldTmpClusters = HashMultimap.create();
117 HashMap<String, String> goldClusters = new HashMap<>();
118 Set<String> okEvents = new HashSet<>();
119
120 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputNaf)) {
121 if (!file.isFile()) {
122 continue;
123 }
124 if (file.getName().startsWith(".")) {
125 continue;
126 }
127
128 String path = file.getParentFile().toString();
129 Integer folder = Integer.parseInt(path.substring(path.lastIndexOf("/")).substring(1));
130 Integer fileNum = Integer.parseInt(file.getName().substring(0, file.getName().length() - 4));
131
132 if (FOLDER != null && !folder.equals(FOLDER)) {
133 continue;
134 }
135
136 LOGGER.debug(file.getAbsolutePath());
137 KAFDocument document = KAFDocument.createFromFile(file);
138
139 for (Coref coref : document.getCorefs()) {
140 if (coref.getType() == null) {
141 continue;
142 }
143 if (!coref.getType().equals("event-gold")) {
144 continue;
145 }
146
147 Integer cluster = Integer.parseInt(coref.getCluster());
148 String idCluster = String.valueOf(1000 * folder + cluster);
149
150 for (Span<Term> termSpan : coref.getSpans()) {
151 Term term = termSpan.getTargets().get(0);
152 String lemma = term.getLemma();
153 if (lemmas == null || lemmas.containsKey(lemma)) {
154 String text = folder + "_" + fileNum + "_" + term.getOffset();
155 goldTmpClusters.put(idCluster, text);
156 goldClusters.put(text, idCluster);
157 okEvents.add(text);
158 }
159 }
160 }
161
162 goldWriter.append(String.format("#begin document %d_%d", folder, fileNum)).append("\n");
163 baselineWriter.append(String.format("#begin document %d_%d", folder, fileNum)).append("\n");
164
165 Integer numSentences = document.getNumSentences();
166 for (int i = 1; i <= numSentences; i++) {
167
168 boolean useThis = false;
169 StringBuilder goldBuilder = new StringBuilder();
170 StringBuilder baselineBuilder = new StringBuilder();
171
172 List<Term> sentenceTerms = document.getSentenceTerms(i);
173 int n = 0;
174 for (Term token : sentenceTerms) {
175 String id = String.format("%d_%d_%d", folder, fileNum, token.getOffset());
176 String last;
177 n++;
178
179 last = "_";
180 if (goldClusters.containsKey(id)) {
181 last = String.format("(%s)", goldClusters.get(id));
182 useThis = true;
183 }
184 printToken(goldBuilder, token, n, last);
185
186 last = "_";
187 String lemma = token.getLemma();
188 if (goldClusters.containsKey(id) && lemmas.containsKey(lemma)) {
189 last = String.format("(%d)", lemmas.get(lemma));
190 }
191 printToken(baselineBuilder, token, n, last);
192 }
193
194 goldBuilder.append("\n");
195 baselineBuilder.append("\n");
196
197 if (useThis) {
198 goldWriter.append(goldBuilder.toString());
199 baselineWriter.append(baselineBuilder.toString());
200 }
201 }
202 }
203
204 goldWriter.close();
205 baselineWriter.close();
206
207
208
209
210
211
212
213
214
215
216
217 in = new FileReader(inputCsv);
218 records = CSVFormat.EXCEL.withHeader().parse(in);
219
220
221 int clusterID = 0;
222 HashMap<String, Integer> clusterIndexes = new HashMap<>();
223 HashMultimap<Integer, String> tmpClusters = HashMultimap.create();
224 for (CSVRecord record : records) {
225 Matcher matcher;
226
227 String id1 = null;
228 String id2 = null;
229 matcher = tokenPattern.matcher(record.get(1));
230 if (matcher.find()) {
231 id1 = matcher.group(1) + "_" + matcher.group(2) + "_" + matcher.group(3);
232 }
233 matcher = tokenPattern.matcher(record.get(3));
234 if (matcher.find()) {
235 id2 = matcher.group(1) + "_" + matcher.group(2) + "_" + matcher.group(3);
236 }
237
238 Integer index1 = clusterIndexes.get(id1);
239 Integer index2 = clusterIndexes.get(id2);
240
241 if (index1 == null && index2 == null) {
242 clusterID++;
243 if (!checkGold || okEvents.contains(id2)) {
244 if (id2 != null) {
245 tmpClusters.put(clusterID, id2);
246 clusterIndexes.put(id2, clusterID);
247 }
248 }
249 if (!checkGold || okEvents.contains(id1)) {
250 if (id1 != null) {
251 tmpClusters.put(clusterID, id1);
252 clusterIndexes.put(id1, clusterID);
253 }
254 }
255 }
256 if (index1 == null && index2 != null) {
257 if (!checkGold || okEvents.contains(id1)) {
258 if (id1 != null) {
259 tmpClusters.put(index2, id1);
260 clusterIndexes.put(id1, index2);
261 }
262 }
263 }
264 if (index2 == null && index1 != null) {
265 if (!checkGold || okEvents.contains(id2)) {
266 if (id2 != null) {
267 tmpClusters.put(index1, id2);
268 clusterIndexes.put(id2, index1);
269 }
270 }
271 }
272 if (index2 != null && index1 != null) {
273 if (!index1.equals(index2)) {
274 if (id2 != null) {
275 clusterIndexes.put(id2, index1);
276 tmpClusters.putAll(index1, tmpClusters.get(index2));
277 tmpClusters.removeAll(index2);
278 }
279 }
280 }
281 }
282
283 System.out.println(tmpClusters);
284
285 int otherClusterID = 10000;
286 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputNaf)) {
287 if (!file.isFile()) {
288 continue;
289 }
290 if (file.getName().startsWith(".")) {
291 continue;
292 }
293
294 String path = file.getParentFile().toString();
295 Integer folder = Integer.parseInt(path.substring(path.lastIndexOf("/")).substring(1));
296 Integer fileNum = Integer.parseInt(file.getName().substring(0, file.getName().length() - 4));
297
298 LOGGER.debug(file.getAbsolutePath());
299 KAFDocument document = KAFDocument.createFromFile(file);
300
301 if (FOLDER != null && !folder.equals(FOLDER)) {
302 continue;
303 }
304
305 writer.append(String.format("#begin document %d_%d", folder, fileNum)).append("\n");
306 Integer numSentences = document.getNumSentences();
307 for (int i = 1; i <= numSentences; i++) {
308
309 boolean useThis = false;
310 StringBuilder outBuilder = new StringBuilder();
311
312 List<Term> sentenceTerms = document.getSentenceTerms(i);
313 int n = 0;
314 for (Term token : sentenceTerms) {
315 String id = String.format("%d_%d_%d", folder, fileNum, token.getOffset());
316 if (okEvents.contains(id)) {
317 useThis = true;
318 }
319 String last;
320 if (clusterIndexes.containsKey(id)) {
321 last = String.format("(%s)", clusterIndexes.get(id));
322 } else {
323 if (okEvents.contains(id) && addSingleClusters) {
324 last = String.format("(%d)", ++otherClusterID);
325 } else {
326 last = "_";
327 }
328 }
329 printToken(outBuilder, token, ++n, last);
330 }
331
332 outBuilder.append("\n");
333
334 if (useThis) {
335 writer.append(outBuilder.toString());
336 }
337 }
338
339 }
340 writer.close();
341
342 } catch (Exception e) {
343 CommandLine.fail(e);
344
345 }
346 }
347
348 }