1 package eu.fbk.dkm.pikes.resources.ecb;
2
3 import com.google.common.collect.HashMultimap;
4 import com.google.common.io.Files;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.Coref;
7 import ixa.kaflib.KAFDocument;
8 import ixa.kaflib.Span;
9 import ixa.kaflib.Term;
10 import org.apache.commons.csv.CSVFormat;
11 import org.apache.commons.csv.CSVRecord;
12 import org.slf4j.Logger;
13 import org.slf4j.LoggerFactory;
14
15 import java.io.*;
16 import java.util.*;
17 import java.util.regex.Matcher;
18 import java.util.regex.Pattern;
19
20
21
22
23 @SuppressWarnings("Duplicates") public class ECBPlusEvaluatorLemma {
24
25 private static final Logger LOGGER = LoggerFactory.getLogger(ECBPlusEvaluatorLemma.class);
26 private static final Pattern tokenPattern = Pattern.compile(".*/([0-9]+)_([0-9]+ecb[a-z]*)\\.xml#char=([0-9]+).*");
27 private static final Pattern fileNamePattern = Pattern.compile("[0-9]+/([0-9]+)_([0-9a-zA-Z]+)");
28
29
30
31 private static Integer FOLDER = null;
32
33 public static void printToken(Appendable writer, Term token, int i, String last) throws IOException {
34 writer.append(String.format("%d", i)).append("\t");
35 writer.append(token.getForm()).append("\t");
36 writer.append("_").append("\t");
37 writer.append(token.getForm()).append("\t");
38 writer.append("_").append("\t");
39 writer.append(token.getMorphofeat()).append("\t");
40 writer.append("_").append("\t");
41 writer.append("_").append("\t");
42 writer.append("_").append("\t");
43 writer.append("_").append("\t");
44 writer.append("_").append("\t");
45 writer.append("_").append("\t");
46 writer.append("_").append("\t");
47 writer.append("_").append("\t");
48 writer.append("_").append("\t");
49 writer.append("_").append("\t");
50 writer.append(last);
51 writer.append("\n");
52
53 }
54
55 public static void main(String[] args) {
56 try {
57
58 final CommandLine cmd = CommandLine
59 .parser()
60 .withName("./ecb-evaluator")
61 .withHeader("Evaluator event extractor")
62 .withOption("n", "input-naf", "Input NAF folder", "FOLDER",
63 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
64 .withOption("i", "input-csv", "Input CSV file", "FILE",
65 CommandLine.Type.FILE_EXISTING, true, false, true)
66 .withOption("g", "output-gold", "Output gold file", "FILE",
67 CommandLine.Type.FILE, true, false, true)
68 .withOption("b", "output-baseline", "Output baseline file", "FILE",
69 CommandLine.Type.FILE, true, false, true)
70 .withOption("o", "output", "Output file", "FILE",
71 CommandLine.Type.FILE, true, false, true)
72 .withOption("l", "input-lemmas", "Lemmas CSV file", "FILE",
73 CommandLine.Type.FILE_EXISTING, true, false, false)
74 .withOption("a", "input-all-lemmas", "Lemmas CSV file", "FILE",
75 CommandLine.Type.FILE_EXISTING, true, false, false)
76
77 .withOption("c", "check-gold", "Use only events annotated in gold standard")
78 .withOption("s", "add-single", "Add single clusters")
79 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
80
81 File inputCsv = cmd.getOptionValue("input-csv", File.class);
82 File inputNaf = cmd.getOptionValue("input-naf", File.class);
83 File inputLemmas = cmd.getOptionValue("input-lemmas", File.class);
84 File inputAllLemmas = cmd.getOptionValue("input-all-lemmas", File.class);
85
86 File outputGold = cmd.getOptionValue("output-gold", File.class);
87 File outputBaseline = cmd.getOptionValue("output-baseline", File.class);
88 File output = cmd.getOptionValue("output", File.class);
89
90
91 Boolean checkGold = cmd.hasOption("check-gold");
92 Boolean addSingleClusters = cmd.hasOption("add-single");
93
94 Reader in;
95 Iterable<CSVRecord> records;
96
97 HashMap<String, Integer> lemmas = null;
98 HashMap<String, Integer> allLemmas = null;
99
100 int lemmaIndex = 0;
101 if (inputLemmas != null) {
102 lemmas = new HashMap<>();
103 in = new FileReader(inputLemmas);
104 records = CSVFormat.EXCEL.withHeader().parse(in);
105 for (CSVRecord record : records) {
106 String lemma = record.get(1);
107 lemma = lemma.replaceAll("\"", "").trim();
108 if (lemma.length() > 0) {
109 lemmas.put(lemma, ++lemmaIndex);
110 }
111 }
112 }
113 lemmaIndex = 0;
114 if (inputAllLemmas != null) {
115 allLemmas = new HashMap<>();
116 in = new FileReader(inputAllLemmas);
117 records = CSVFormat.EXCEL.withHeader().parse(in);
118 for (CSVRecord record : records) {
119 String lemma = record.get(1);
120 lemma = lemma.replaceAll("\"", "").trim();
121 if (lemma.length() > 0) {
122 allLemmas.put(lemma, ++lemmaIndex);
123 }
124 }
125 }
126
127 if (lemmas != null) {
128 LOGGER.info("Lemmas: {}", lemmas.size());
129 }
130 if (allLemmas != null) {
131 LOGGER.info("All-lemmas: {}", allLemmas.size());
132 }
133
134 BufferedWriter goldWriter = new BufferedWriter(new FileWriter(outputGold));
135 BufferedWriter baselineWriter = new BufferedWriter(new FileWriter(outputBaseline));
136 BufferedWriter writer = new BufferedWriter(new FileWriter(output));
137
138 HashMultimap<String, String> goldTmpClusters = HashMultimap.create();
139 HashMap<String, String> goldClusters = new HashMap<>();
140 Set<String> okEvents = new HashSet<>();
141
142 Map<String, String> theBaseline = new HashMap<>();
143
144 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputNaf)) {
145 if (!file.isFile()) {
146 continue;
147 }
148 if (file.getName().startsWith(".")) {
149 continue;
150 }
151
152 String path = file.getParentFile().toString();
153 String relativeFilePath = file.getAbsolutePath()
154 .substring(inputNaf.getAbsolutePath().length());
155
156 Matcher matcher = fileNamePattern.matcher(relativeFilePath);
157 Integer folder = null;
158 String fileNum = null;
159 if (matcher.find()) {
160 folder = Integer.parseInt(matcher.group(1));
161 fileNum = matcher.group(2);
162
163 } else {
164 LOGGER.error("Error in file name: {}", relativeFilePath);
165 System.exit(1);
166 }
167
168 if (FOLDER != null && !folder.equals(FOLDER)) {
169 continue;
170 }
171
172 LOGGER.debug(file.getAbsolutePath());
173 KAFDocument document = KAFDocument.createFromFile(file);
174
175 for (Coref coref : document.getCorefs()) {
176 if (coref.getType() == null) {
177 continue;
178 }
179 if (!coref.getType().equals("event-gold")) {
180 continue;
181 }
182
183 Integer cluster = Integer.parseInt(coref.getCluster());
184 String idCluster = String.valueOf(1000 * folder + cluster);
185
186 for (Span<Term> termSpan : coref.getSpans()) {
187 Term term = termSpan.getTargets().get(0);
188 String lemma = term.getLemma();
189
190 boolean add = false;
191 if (allLemmas != null && allLemmas.containsKey(lemma)) {
192 add = true;
193 }
194 if (lemmas == null || lemmas.containsKey(lemma)) {
195 add = true;
196 }
197
198 if (add) {
199 String text = folder + "_" + fileNum + "_" + term.getOffset();
200 goldTmpClusters.put(idCluster, text);
201 goldClusters.put(text, idCluster);
202 okEvents.add(text);
203 }
204 }
205 }
206
207 goldWriter.append(String.format("#begin document %d_%s", folder, fileNum)).append("\n");
208 baselineWriter.append(String.format("#begin document %d_%s", folder, fileNum)).append("\n");
209
210 Integer numSentences = document.getNumSentences();
211 for (int i = 1; i <= numSentences; i++) {
212
213 boolean useThis = false;
214 StringBuilder goldBuilder = new StringBuilder();
215 StringBuilder baselineBuilder = new StringBuilder();
216
217 List<Term> sentenceTerms = document.getSentenceTerms(i);
218 int n = 0;
219 for (Term token : sentenceTerms) {
220 String id = String.format("%d_%s_%d", folder, fileNum, token.getOffset());
221 String last;
222 n++;
223
224 last = "_";
225 if (goldClusters.containsKey(id)) {
226 last = String.format("(%s)", goldClusters.get(id));
227 useThis = true;
228 }
229 printToken(goldBuilder, token, n, last);
230
231 last = "_";
232 String lemma = token.getLemma();
233 if (lemmas != null) {
234 if (goldClusters.containsKey(id) && lemmas.containsKey(lemma)) {
235 last = String.format("(%d)", lemmas.get(lemma));
236 }
237 }
238 if (allLemmas != null) {
239 if (goldClusters.containsKey(id) && allLemmas.containsKey(lemma)) {
240 last = String.format("(%d)", allLemmas.get(lemma));
241 }
242 }
243 if (!last.equals("_")) {
244 theBaseline.put(id, last);
245 }
246 printToken(baselineBuilder, token, n, last);
247 }
248
249 goldBuilder.append("\n");
250 baselineBuilder.append("\n");
251
252 if (useThis) {
253 goldWriter.append(goldBuilder.toString());
254 baselineWriter.append(baselineBuilder.toString());
255 }
256 }
257
258 }
259
260 goldWriter.close();
261 baselineWriter.close();
262
263
264
265
266
267
268
269
270
271
272
273 in = new FileReader(inputCsv);
274 records = CSVFormat.EXCEL.withHeader().parse(in);
275
276
277 int clusterID = 0;
278 HashMap<String, Integer> clusterIndexes = new HashMap<>();
279 HashMultimap<Integer, String> theClusters = HashMultimap.create();
280 for (CSVRecord record : records) {
281 Matcher matcher;
282
283 String id1 = null;
284 String id2 = null;
285 matcher = tokenPattern.matcher(record.get(1));
286 if (matcher.find()) {
287 id1 = matcher.group(1) + "_" + matcher.group(2) + "_" + matcher.group(3);
288 }
289 matcher = tokenPattern.matcher(record.get(3));
290 if (matcher.find()) {
291 id2 = matcher.group(1) + "_" + matcher.group(2) + "_" + matcher.group(3);
292 }
293
294
295
296
297 Integer index1 = clusterIndexes.get(id1);
298 Integer index2 = clusterIndexes.get(id2);
299
300
301
302
303 if (index1 == null && index2 == null) {
304 clusterID++;
305 if (!checkGold || okEvents.contains(id2)) {
306 if (id2 != null) {
307 theClusters.put(clusterID, id2);
308 clusterIndexes.put(id2, clusterID);
309 }
310 }
311 if (!checkGold || okEvents.contains(id1)) {
312 if (id1 != null) {
313 theClusters.put(clusterID, id1);
314 clusterIndexes.put(id1, clusterID);
315 }
316 }
317 }
318 if (index1 == null && index2 != null) {
319 if (!checkGold || okEvents.contains(id1)) {
320 if (id1 != null) {
321 theClusters.put(index2, id1);
322 clusterIndexes.put(id1, index2);
323 }
324 }
325 }
326 if (index2 == null && index1 != null) {
327 if (!checkGold || okEvents.contains(id2)) {
328 if (id2 != null) {
329 theClusters.put(index1, id2);
330 clusterIndexes.put(id2, index1);
331 }
332 }
333 }
334 if (index2 != null && index1 != null) {
335 if (!index1.equals(index2)) {
336 if (id2 != null) {
337 clusterIndexes.put(id2, index1);
338 theClusters.putAll(index1, theClusters.get(index2));
339 theClusters.removeAll(index2);
340 }
341 }
342 }
343 }
344
345
346
347
348 int otherClusterID = 100000;
349 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputNaf)) {
350 if (!file.isFile()) {
351 continue;
352 }
353 if (file.getName().startsWith(".")) {
354 continue;
355 }
356
357
358 String relativeFilePath = file.getAbsolutePath()
359 .substring(inputNaf.getAbsolutePath().length());
360
361 Matcher matcher = fileNamePattern.matcher(relativeFilePath);
362 Integer folder = null;
363 String fileNum = null;
364 if (matcher.find()) {
365 folder = Integer.parseInt(matcher.group(1));
366 fileNum = matcher.group(2);
367
368 } else {
369 LOGGER.error("Error in file name: {}", relativeFilePath);
370 System.exit(1);
371 }
372
373
374
375 LOGGER.debug(file.getAbsolutePath());
376 KAFDocument document = KAFDocument.createFromFile(file);
377
378 if (FOLDER != null && !folder.equals(FOLDER)) {
379 continue;
380 }
381
382 writer.append(String.format("#begin document %d_%s", folder, fileNum)).append("\n");
383 Integer numSentences = document.getNumSentences();
384 for (int i = 1; i <= numSentences; i++) {
385
386 boolean useThis = false;
387 StringBuilder outBuilder = new StringBuilder();
388
389 List<Term> sentenceTerms = document.getSentenceTerms(i);
390 int n = 0;
391 for (Term token : sentenceTerms) {
392 String id = String.format("%d_%s_%d", folder, fileNum, token.getOffset());
393 if (okEvents.contains(id)) {
394 useThis = true;
395 }
396 String last = theBaseline.getOrDefault(id, "_");
397 if (clusterIndexes.containsKey(id)) {
398 last = String.format("(%d)", clusterIndexes.get(id) + 1000000);
399 }
400 if (last.equals("_")) {
401 if (okEvents.contains(id) && addSingleClusters) {
402 last = String.format("(%d)", ++otherClusterID);
403 }
404 }
405 printToken(outBuilder, token, ++n, last);
406 }
407
408 outBuilder.append("\n");
409
410 if (useThis) {
411 writer.append(outBuilder.toString());
412 }
413 }
414
415 }
416 writer.close();
417
418 } catch (Exception e) {
419 CommandLine.fail(e);
420
421 }
422 }
423
424 }