1   package eu.fbk.dkm.pikes.resources.ecb;
2   
3   import com.google.common.collect.HashMultimap;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.Coref;
7   import ixa.kaflib.KAFDocument;
8   import ixa.kaflib.Span;
9   import ixa.kaflib.Term;
10  import org.apache.commons.csv.CSVFormat;
11  import org.apache.commons.csv.CSVRecord;
12  import org.slf4j.Logger;
13  import org.slf4j.LoggerFactory;
14  
15  import java.io.*;
16  import java.util.*;
17  import java.util.regex.Matcher;
18  import java.util.regex.Pattern;
19  
20  /**
21   * Created by marcorospocher on 12/03/16.
22   */
23  @SuppressWarnings("Duplicates") public class ECBPlusEvaluatorLemma {
24  
25      private static final Logger LOGGER = LoggerFactory.getLogger(ECBPlusEvaluatorLemma.class);
26      private static final Pattern tokenPattern = Pattern.compile(".*/([0-9]+)_([0-9]+ecb[a-z]*)\\.xml#char=([0-9]+).*");
27      private static final Pattern fileNamePattern = Pattern.compile("[0-9]+/([0-9]+)_([0-9a-zA-Z]+)");
28  
29      //    private static final Boolean removeAloneClusters = false;
30  //    private static final Pattern chainPattern = Pattern.compile("CHAIN=\"([0-9]+)\"");
31      private static Integer FOLDER = null;
32  
33      public static void printToken(Appendable writer, Term token, int i, String last) throws IOException {
34          writer.append(String.format("%d", i)).append("\t");
35          writer.append(token.getForm()).append("\t");
36          writer.append("_").append("\t");
37          writer.append(token.getForm()).append("\t");
38          writer.append("_").append("\t");
39          writer.append(token.getMorphofeat()).append("\t");
40          writer.append("_").append("\t");
41          writer.append("_").append("\t");
42          writer.append("_").append("\t");
43          writer.append("_").append("\t");
44          writer.append("_").append("\t");
45          writer.append("_").append("\t");
46          writer.append("_").append("\t");
47          writer.append("_").append("\t");
48          writer.append("_").append("\t");
49          writer.append("_").append("\t");
50          writer.append(last);
51          writer.append("\n");
52  
53      }
54  
55      public static void main(String[] args) {
56          try {
57  
58              final CommandLine cmd = CommandLine
59                      .parser()
60                      .withName("./ecb-evaluator")
61                      .withHeader("Evaluator event extractor")
62                      .withOption("n", "input-naf", "Input NAF folder", "FOLDER",
63                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
64                      .withOption("i", "input-csv", "Input CSV file", "FILE",
65                              CommandLine.Type.FILE_EXISTING, true, false, true)
66                      .withOption("g", "output-gold", "Output gold file", "FILE",
67                              CommandLine.Type.FILE, true, false, true)
68                      .withOption("b", "output-baseline", "Output baseline file", "FILE",
69                              CommandLine.Type.FILE, true, false, true)
70                      .withOption("o", "output", "Output file", "FILE",
71                              CommandLine.Type.FILE, true, false, true)
72                      .withOption("l", "input-lemmas", "Lemmas CSV file", "FILE",
73                              CommandLine.Type.FILE_EXISTING, true, false, false)
74                      .withOption("a", "input-all-lemmas", "Lemmas CSV file", "FILE",
75                              CommandLine.Type.FILE_EXISTING, true, false, false)
76  //                    .withOption("r", "remove-alone", "Remove alone clusters")
77                      .withOption("c", "check-gold", "Use only events annotated in gold standard")
78                      .withOption("s", "add-single", "Add single clusters")
79                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
80  
81              File inputCsv = cmd.getOptionValue("input-csv", File.class);
82              File inputNaf = cmd.getOptionValue("input-naf", File.class);
83              File inputLemmas = cmd.getOptionValue("input-lemmas", File.class);
84              File inputAllLemmas = cmd.getOptionValue("input-all-lemmas", File.class);
85  
86              File outputGold = cmd.getOptionValue("output-gold", File.class);
87              File outputBaseline = cmd.getOptionValue("output-baseline", File.class);
88              File output = cmd.getOptionValue("output", File.class);
89  
90  //            Boolean removeAloneClusters = cmd.hasOption("remove-alone");
91              Boolean checkGold = cmd.hasOption("check-gold");
92              Boolean addSingleClusters = cmd.hasOption("add-single");
93  
94              Reader in;
95              Iterable<CSVRecord> records;
96  
97              HashMap<String, Integer> lemmas = null;
98              HashMap<String, Integer> allLemmas = null;
99  
100             int lemmaIndex = 0;
101             if (inputLemmas != null) {
102                 lemmas = new HashMap<>();
103                 in = new FileReader(inputLemmas);
104                 records = CSVFormat.EXCEL.withHeader().parse(in);
105                 for (CSVRecord record : records) {
106                     String lemma = record.get(1);
107                     lemma = lemma.replaceAll("\"", "").trim();
108                     if (lemma.length() > 0) {
109                         lemmas.put(lemma, ++lemmaIndex);
110                     }
111                 }
112             }
113             lemmaIndex = 0;
114             if (inputAllLemmas != null) {
115                 allLemmas = new HashMap<>();
116                 in = new FileReader(inputAllLemmas);
117                 records = CSVFormat.EXCEL.withHeader().parse(in);
118                 for (CSVRecord record : records) {
119                     String lemma = record.get(1);
120                     lemma = lemma.replaceAll("\"", "").trim();
121                     if (lemma.length() > 0) {
122                         allLemmas.put(lemma, ++lemmaIndex);
123                     }
124                 }
125             }
126 
127             if (lemmas != null) {
128                 LOGGER.info("Lemmas: {}", lemmas.size());
129             }
130             if (allLemmas != null) {
131                 LOGGER.info("All-lemmas: {}", allLemmas.size());
132             }
133 
134             BufferedWriter goldWriter = new BufferedWriter(new FileWriter(outputGold));
135             BufferedWriter baselineWriter = new BufferedWriter(new FileWriter(outputBaseline));
136             BufferedWriter writer = new BufferedWriter(new FileWriter(output));
137 
138             HashMultimap<String, String> goldTmpClusters = HashMultimap.create();
139             HashMap<String, String> goldClusters = new HashMap<>();
140             Set<String> okEvents = new HashSet<>();
141 
142             Map<String, String> theBaseline = new HashMap<>();
143 
144             for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputNaf)) {
145                 if (!file.isFile()) {
146                     continue;
147                 }
148                 if (file.getName().startsWith(".")) {
149                     continue;
150                 }
151 
152                 String path = file.getParentFile().toString();
153                 String relativeFilePath = file.getAbsolutePath()
154                         .substring(inputNaf.getAbsolutePath().length());
155 
156                 Matcher matcher = fileNamePattern.matcher(relativeFilePath);
157                 Integer folder = null;
158                 String fileNum = null;
159                 if (matcher.find()) {
160                     folder = Integer.parseInt(matcher.group(1));
161                     fileNum = matcher.group(2);
162 
163                 } else {
164                     LOGGER.error("Error in file name: {}", relativeFilePath);
165                     System.exit(1);
166                 }
167 
168                 if (FOLDER != null && !folder.equals(FOLDER)) {
169                     continue;
170                 }
171 
172                 LOGGER.debug(file.getAbsolutePath());
173                 KAFDocument document = KAFDocument.createFromFile(file);
174 
175                 for (Coref coref : document.getCorefs()) {
176                     if (coref.getType() == null) {
177                         continue;
178                     }
179                     if (!coref.getType().equals("event-gold")) {
180                         continue;
181                     }
182 
183                     Integer cluster = Integer.parseInt(coref.getCluster());
184                     String idCluster = String.valueOf(1000 * folder + cluster);
185 
186                     for (Span<Term> termSpan : coref.getSpans()) {
187                         Term term = termSpan.getTargets().get(0);
188                         String lemma = term.getLemma();
189 
190                         boolean add = false;
191                         if (allLemmas != null && allLemmas.containsKey(lemma)) {
192                             add = true;
193                         }
194                         if (lemmas == null || lemmas.containsKey(lemma)) {
195                             add = true;
196                         }
197 
198                         if (add) {
199                             String text = folder + "_" + fileNum + "_" + term.getOffset();
200                             goldTmpClusters.put(idCluster, text);
201                             goldClusters.put(text, idCluster);
202                             okEvents.add(text);
203                         }
204                     }
205                 }
206 
207                 goldWriter.append(String.format("#begin document %d_%s", folder, fileNum)).append("\n");
208                 baselineWriter.append(String.format("#begin document %d_%s", folder, fileNum)).append("\n");
209 
210                 Integer numSentences = document.getNumSentences();
211                 for (int i = 1; i <= numSentences; i++) {
212 
213                     boolean useThis = false;
214                     StringBuilder goldBuilder = new StringBuilder();
215                     StringBuilder baselineBuilder = new StringBuilder();
216 
217                     List<Term> sentenceTerms = document.getSentenceTerms(i);
218                     int n = 0;
219                     for (Term token : sentenceTerms) {
220                         String id = String.format("%d_%s_%d", folder, fileNum, token.getOffset());
221                         String last;
222                         n++;
223 
224                         last = "_";
225                         if (goldClusters.containsKey(id)) {
226                             last = String.format("(%s)", goldClusters.get(id));
227                             useThis = true;
228                         }
229                         printToken(goldBuilder, token, n, last);
230 
231                         last = "_";
232                         String lemma = token.getLemma();
233                         if (lemmas != null) {
234                             if (goldClusters.containsKey(id) && lemmas.containsKey(lemma)) {
235                                 last = String.format("(%d)", lemmas.get(lemma));
236                             }
237                         }
238                         if (allLemmas != null) {
239                             if (goldClusters.containsKey(id) && allLemmas.containsKey(lemma)) {
240                                 last = String.format("(%d)", allLemmas.get(lemma));
241                             }
242                         }
243                         if (!last.equals("_")) {
244                             theBaseline.put(id, last);
245                         }
246                         printToken(baselineBuilder, token, n, last);
247                     }
248 
249                     goldBuilder.append("\n");
250                     baselineBuilder.append("\n");
251 
252                     if (useThis) {
253                         goldWriter.append(goldBuilder.toString());
254                         baselineWriter.append(baselineBuilder.toString());
255                     }
256                 }
257 //                break;
258             }
259 
260             goldWriter.close();
261             baselineWriter.close();
262 
263 //            Set<Set> goldClusters = new HashSet<>();
264 //            for (String key : goldTmpClusters.keySet()) {
265 //                Set<String> cluster = goldTmpClusters.get(key);
266 //                if (cluster.size() > 1 || !removeAloneClusters) {
267 //                    goldClusters.add(cluster);
268 //                }
269 //            }
270 
271 //            LOGGER.info("Gold clusters: {}", goldClusters.size());
272 
273             in = new FileReader(inputCsv);
274             records = CSVFormat.EXCEL.withHeader().parse(in);
275 
276             // Size must be always 4!
277             int clusterID = 0;
278             HashMap<String, Integer> clusterIndexes = new HashMap<>();
279             HashMultimap<Integer, String> theClusters = HashMultimap.create();
280             for (CSVRecord record : records) {
281                 Matcher matcher;
282 
283                 String id1 = null;
284                 String id2 = null;
285                 matcher = tokenPattern.matcher(record.get(1));
286                 if (matcher.find()) {
287                     id1 = matcher.group(1) + "_" + matcher.group(2) + "_" + matcher.group(3);
288                 }
289                 matcher = tokenPattern.matcher(record.get(3));
290                 if (matcher.find()) {
291                     id2 = matcher.group(1) + "_" + matcher.group(2) + "_" + matcher.group(3);
292                 }
293 
294 //                System.out.println(id1);
295 //                System.out.println(id2);
296 
297                 Integer index1 = clusterIndexes.get(id1);
298                 Integer index2 = clusterIndexes.get(id2);
299 
300 //                System.out.println(index1);
301 //                System.out.println(index2);
302 
303                 if (index1 == null && index2 == null) {
304                     clusterID++;
305                     if (!checkGold || okEvents.contains(id2)) {
306                         if (id2 != null) {
307                             theClusters.put(clusterID, id2);
308                             clusterIndexes.put(id2, clusterID);
309                         }
310                     }
311                     if (!checkGold || okEvents.contains(id1)) {
312                         if (id1 != null) {
313                             theClusters.put(clusterID, id1);
314                             clusterIndexes.put(id1, clusterID);
315                         }
316                     }
317                 }
318                 if (index1 == null && index2 != null) {
319                     if (!checkGold || okEvents.contains(id1)) {
320                         if (id1 != null) {
321                             theClusters.put(index2, id1);
322                             clusterIndexes.put(id1, index2);
323                         }
324                     }
325                 }
326                 if (index2 == null && index1 != null) {
327                     if (!checkGold || okEvents.contains(id2)) {
328                         if (id2 != null) {
329                             theClusters.put(index1, id2);
330                             clusterIndexes.put(id2, index1);
331                         }
332                     }
333                 }
334                 if (index2 != null && index1 != null) {
335                     if (!index1.equals(index2)) {
336                         if (id2 != null) {
337                             clusterIndexes.put(id2, index1);
338                             theClusters.putAll(index1, theClusters.get(index2));
339                             theClusters.removeAll(index2);
340                         }
341                     }
342                 }
343             }
344 
345 //            System.out.println(theClusters);
346 //            System.out.println(theBaseline);
347 
348             int otherClusterID = 100000;
349             for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputNaf)) {
350                 if (!file.isFile()) {
351                     continue;
352                 }
353                 if (file.getName().startsWith(".")) {
354                     continue;
355                 }
356 
357 //                String path = file.getParentFile().toString();
358                 String relativeFilePath = file.getAbsolutePath()
359                         .substring(inputNaf.getAbsolutePath().length());
360 
361                 Matcher matcher = fileNamePattern.matcher(relativeFilePath);
362                 Integer folder = null;
363                 String fileNum = null;
364                 if (matcher.find()) {
365                     folder = Integer.parseInt(matcher.group(1));
366                     fileNum = matcher.group(2);
367 
368                 } else {
369                     LOGGER.error("Error in file name: {}", relativeFilePath);
370                     System.exit(1);
371                 }
372 //                Integer folder = Integer.parseInt(path.substring(path.lastIndexOf("/")).substring(1));
373 //                Integer fileNum = Integer.parseInt(file.getName().substring(0, file.getName().length() - 4));
374 
375                 LOGGER.debug(file.getAbsolutePath());
376                 KAFDocument document = KAFDocument.createFromFile(file);
377 
378                 if (FOLDER != null && !folder.equals(FOLDER)) {
379                     continue;
380                 }
381 
382                 writer.append(String.format("#begin document %d_%s", folder, fileNum)).append("\n");
383                 Integer numSentences = document.getNumSentences();
384                 for (int i = 1; i <= numSentences; i++) {
385 
386                     boolean useThis = false;
387                     StringBuilder outBuilder = new StringBuilder();
388 
389                     List<Term> sentenceTerms = document.getSentenceTerms(i);
390                     int n = 0;
391                     for (Term token : sentenceTerms) {
392                         String id = String.format("%d_%s_%d", folder, fileNum, token.getOffset());
393                         if (okEvents.contains(id)) {
394                             useThis = true;
395                         }
396                         String last = theBaseline.getOrDefault(id, "_");
397                         if (clusterIndexes.containsKey(id)) {
398                             last = String.format("(%d)", clusterIndexes.get(id) + 1000000);
399                         }
400                         if (last.equals("_")) {
401                             if (okEvents.contains(id) && addSingleClusters) {
402                                 last = String.format("(%d)", ++otherClusterID);
403                             }
404                         }
405                         printToken(outBuilder, token, ++n, last);
406                     }
407 
408                     outBuilder.append("\n");
409 
410                     if (useThis) {
411                         writer.append(outBuilder.toString());
412                     }
413                 }
414 
415             }
416             writer.close();
417 
418         } catch (Exception e) {
419             CommandLine.fail(e);
420 
421         }
422     }
423 
424 }