1   package eu.fbk.dkm.pikes.resources.ecb;
2   
3   import com.google.common.collect.HashMultimap;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.Coref;
7   import ixa.kaflib.KAFDocument;
8   import ixa.kaflib.Span;
9   import ixa.kaflib.Term;
10  import org.apache.commons.csv.CSVFormat;
11  import org.apache.commons.csv.CSVRecord;
12  import org.slf4j.Logger;
13  import org.slf4j.LoggerFactory;
14  
15  import java.io.*;
16  import java.util.HashMap;
17  import java.util.HashSet;
18  import java.util.List;
19  import java.util.Set;
20  import java.util.regex.Matcher;
21  import java.util.regex.Pattern;
22  
23  /**
24   * Created by marcorospocher on 12/03/16.
25   */
26  public class ECBevaluator {
27  
28      private static final Logger LOGGER = LoggerFactory.getLogger(ECBevaluator.class);
29      private static final Pattern tokenPattern = Pattern.compile("/([0-9]+)/([0-9])\\.ecb#char=([0-9]+)");
30      //    private static final Boolean removeAloneClusters = false;
31  //    private static final Pattern chainPattern = Pattern.compile("CHAIN=\"([0-9]+)\"");
32      private static Integer FOLDER = null;
33  
34      public static void printToken(Appendable writer, Term token, int i, String last) throws IOException {
35          writer.append(String.format("%d", i)).append("\t");
36          writer.append(token.getForm()).append("\t");
37          writer.append("_").append("\t");
38          writer.append(token.getForm()).append("\t");
39          writer.append("_").append("\t");
40          writer.append(token.getMorphofeat()).append("\t");
41          writer.append("_").append("\t");
42          writer.append("_").append("\t");
43          writer.append("_").append("\t");
44          writer.append("_").append("\t");
45          writer.append("_").append("\t");
46          writer.append("_").append("\t");
47          writer.append("_").append("\t");
48          writer.append("_").append("\t");
49          writer.append("_").append("\t");
50          writer.append("_").append("\t");
51          writer.append(last);
52          writer.append("\n");
53  
54      }
55  
56      public static void main(String[] args) {
57          try {
58  
59              final CommandLine cmd = CommandLine
60                      .parser()
61                      .withName("./ecb-evaluator")
62                      .withHeader("Evaluator event extractor")
63                      .withOption("n", "input-naf", "Input NAF folder", "FOLDER",
64                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
65                      .withOption("i", "input-csv", "Input CSV file", "FILE",
66                              CommandLine.Type.FILE_EXISTING, true, false, true)
67                      .withOption("g", "output-gold", "Output gold file", "FILE",
68                              CommandLine.Type.FILE, true, false, true)
69                      .withOption("b", "output-baseline", "Output baseline file", "FILE",
70                              CommandLine.Type.FILE, true, false, true)
71                      .withOption("o", "output", "Output file", "FILE",
72                              CommandLine.Type.FILE, true, false, true)
73                      .withOption("l", "input-lemmas", "Lemmas CSV file", "FILE",
74                              CommandLine.Type.FILE_EXISTING, true, false, false)
75  //                    .withOption("r", "remove-alone", "Remove alone clusters")
76                      .withOption("c", "check-gold", "Use only events annotated in gold standard")
77                      .withOption("s", "add-single", "Add single clusters")
78                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
79  
80              File inputCsv = cmd.getOptionValue("input-csv", File.class);
81              File inputNaf = cmd.getOptionValue("input-naf", File.class);
82              File inputLemmas = cmd.getOptionValue("input-lemmas", File.class);
83  
84              File outputGold = cmd.getOptionValue("output-gold", File.class);
85              File outputBaseline = cmd.getOptionValue("output-baseline", File.class);
86              File output = cmd.getOptionValue("output", File.class);
87  
88  //            Boolean removeAloneClusters = cmd.hasOption("remove-alone");
89              Boolean checkGold = cmd.hasOption("check-gold");
90              Boolean addSingleClusters = cmd.hasOption("add-single");
91  
92              Reader in;
93              Iterable<CSVRecord> records;
94  
95              HashMap<String, Integer> lemmas = null;
96              int lemmaIndex = 0;
97              if (inputLemmas != null) {
98                  lemmas = new HashMap<>();
99                  in = new FileReader(inputLemmas);
100                 records = CSVFormat.EXCEL.withHeader().parse(in);
101                 for (CSVRecord record : records) {
102                     String lemma = record.get(1);
103                     lemma = lemma.replaceAll("\"", "").trim();
104                     if (lemma.length() > 0) {
105                         lemmas.put(lemma, ++lemmaIndex);
106                     }
107                 }
108             }
109 
110             LOGGER.info("Lemmas: {}", lemmas.size());
111 
112             BufferedWriter goldWriter = new BufferedWriter(new FileWriter(outputGold));
113             BufferedWriter baselineWriter = new BufferedWriter(new FileWriter(outputBaseline));
114             BufferedWriter writer = new BufferedWriter(new FileWriter(output));
115 
116             HashMultimap<String, String> goldTmpClusters = HashMultimap.create();
117             HashMap<String, String> goldClusters = new HashMap<>();
118             Set<String> okEvents = new HashSet<>();
119 
120             for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputNaf)) {
121                 if (!file.isFile()) {
122                     continue;
123                 }
124                 if (file.getName().startsWith(".")) {
125                     continue;
126                 }
127 
128                 String path = file.getParentFile().toString();
129                 Integer folder = Integer.parseInt(path.substring(path.lastIndexOf("/")).substring(1));
130                 Integer fileNum = Integer.parseInt(file.getName().substring(0, file.getName().length() - 4));
131 
132                 if (FOLDER != null && !folder.equals(FOLDER)) {
133                     continue;
134                 }
135 
136                 LOGGER.debug(file.getAbsolutePath());
137                 KAFDocument document = KAFDocument.createFromFile(file);
138 
139                 for (Coref coref : document.getCorefs()) {
140                     if (coref.getType() == null) {
141                         continue;
142                     }
143                     if (!coref.getType().equals("event-gold")) {
144                         continue;
145                     }
146 
147                     Integer cluster = Integer.parseInt(coref.getCluster());
148                     String idCluster = String.valueOf(1000 * folder + cluster);
149 
150                     for (Span<Term> termSpan : coref.getSpans()) {
151                         Term term = termSpan.getTargets().get(0);
152                         String lemma = term.getLemma();
153                         if (lemmas == null || lemmas.containsKey(lemma)) {
154                             String text = folder + "_" + fileNum + "_" + term.getOffset();
155                             goldTmpClusters.put(idCluster, text);
156                             goldClusters.put(text, idCluster);
157                             okEvents.add(text);
158                         }
159                     }
160                 }
161 
162                 goldWriter.append(String.format("#begin document %d_%d", folder, fileNum)).append("\n");
163                 baselineWriter.append(String.format("#begin document %d_%d", folder, fileNum)).append("\n");
164 
165                 Integer numSentences = document.getNumSentences();
166                 for (int i = 1; i <= numSentences; i++) {
167 
168                     boolean useThis = false;
169                     StringBuilder goldBuilder = new StringBuilder();
170                     StringBuilder baselineBuilder = new StringBuilder();
171 
172                     List<Term> sentenceTerms = document.getSentenceTerms(i);
173                     int n = 0;
174                     for (Term token : sentenceTerms) {
175                         String id = String.format("%d_%d_%d", folder, fileNum, token.getOffset());
176                         String last;
177                         n++;
178 
179                         last = "_";
180                         if (goldClusters.containsKey(id)) {
181                             last = String.format("(%s)", goldClusters.get(id));
182                             useThis = true;
183                         }
184                         printToken(goldBuilder, token, n, last);
185 
186                         last = "_";
187                         String lemma = token.getLemma();
188                         if (goldClusters.containsKey(id) && lemmas.containsKey(lemma)) {
189                             last = String.format("(%d)", lemmas.get(lemma));
190                         }
191                         printToken(baselineBuilder, token, n, last);
192                     }
193 
194                     goldBuilder.append("\n");
195                     baselineBuilder.append("\n");
196 
197                     if (useThis) {
198                         goldWriter.append(goldBuilder.toString());
199                         baselineWriter.append(baselineBuilder.toString());
200                     }
201                 }
202             }
203 
204             goldWriter.close();
205             baselineWriter.close();
206 
207 //            Set<Set> goldClusters = new HashSet<>();
208 //            for (String key : goldTmpClusters.keySet()) {
209 //                Set<String> cluster = goldTmpClusters.get(key);
210 //                if (cluster.size() > 1 || !removeAloneClusters) {
211 //                    goldClusters.add(cluster);
212 //                }
213 //            }
214 
215 //            LOGGER.info("Gold clusters: {}", goldClusters.size());
216 
217             in = new FileReader(inputCsv);
218             records = CSVFormat.EXCEL.withHeader().parse(in);
219 
220             // Size must be always 4!
221             int clusterID = 0;
222             HashMap<String, Integer> clusterIndexes = new HashMap<>();
223             HashMultimap<Integer, String> tmpClusters = HashMultimap.create();
224             for (CSVRecord record : records) {
225                 Matcher matcher;
226 
227                 String id1 = null;
228                 String id2 = null;
229                 matcher = tokenPattern.matcher(record.get(1));
230                 if (matcher.find()) {
231                     id1 = matcher.group(1) + "_" + matcher.group(2) + "_" + matcher.group(3);
232                 }
233                 matcher = tokenPattern.matcher(record.get(3));
234                 if (matcher.find()) {
235                     id2 = matcher.group(1) + "_" + matcher.group(2) + "_" + matcher.group(3);
236                 }
237 
238                 Integer index1 = clusterIndexes.get(id1);
239                 Integer index2 = clusterIndexes.get(id2);
240 
241                 if (index1 == null && index2 == null) {
242                     clusterID++;
243                     if (!checkGold || okEvents.contains(id2)) {
244                         if (id2 != null) {
245                             tmpClusters.put(clusterID, id2);
246                             clusterIndexes.put(id2, clusterID);
247                         }
248                     }
249                     if (!checkGold || okEvents.contains(id1)) {
250                         if (id1 != null) {
251                             tmpClusters.put(clusterID, id1);
252                             clusterIndexes.put(id1, clusterID);
253                         }
254                     }
255                 }
256                 if (index1 == null && index2 != null) {
257                     if (!checkGold || okEvents.contains(id1)) {
258                         if (id1 != null) {
259                             tmpClusters.put(index2, id1);
260                             clusterIndexes.put(id1, index2);
261                         }
262                     }
263                 }
264                 if (index2 == null && index1 != null) {
265                     if (!checkGold || okEvents.contains(id2)) {
266                         if (id2 != null) {
267                             tmpClusters.put(index1, id2);
268                             clusterIndexes.put(id2, index1);
269                         }
270                     }
271                 }
272                 if (index2 != null && index1 != null) {
273                     if (!index1.equals(index2)) {
274                         if (id2 != null) {
275                             clusterIndexes.put(id2, index1);
276                             tmpClusters.putAll(index1, tmpClusters.get(index2));
277                             tmpClusters.removeAll(index2);
278                         }
279                     }
280                 }
281             }
282 
283             System.out.println(tmpClusters);
284 
285             int otherClusterID = 10000;
286             for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputNaf)) {
287                 if (!file.isFile()) {
288                     continue;
289                 }
290                 if (file.getName().startsWith(".")) {
291                     continue;
292                 }
293 
294                 String path = file.getParentFile().toString();
295                 Integer folder = Integer.parseInt(path.substring(path.lastIndexOf("/")).substring(1));
296                 Integer fileNum = Integer.parseInt(file.getName().substring(0, file.getName().length() - 4));
297 
298                 LOGGER.debug(file.getAbsolutePath());
299                 KAFDocument document = KAFDocument.createFromFile(file);
300 
301                 if (FOLDER != null && !folder.equals(FOLDER)) {
302                     continue;
303                 }
304 
305                 writer.append(String.format("#begin document %d_%d", folder, fileNum)).append("\n");
306                 Integer numSentences = document.getNumSentences();
307                 for (int i = 1; i <= numSentences; i++) {
308 
309                     boolean useThis = false;
310                     StringBuilder outBuilder = new StringBuilder();
311 
312                     List<Term> sentenceTerms = document.getSentenceTerms(i);
313                     int n = 0;
314                     for (Term token : sentenceTerms) {
315                         String id = String.format("%d_%d_%d", folder, fileNum, token.getOffset());
316                         if (okEvents.contains(id)) {
317                             useThis = true;
318                         }
319                         String last;
320                         if (clusterIndexes.containsKey(id)) {
321                             last = String.format("(%s)", clusterIndexes.get(id));
322                         } else {
323                             if (okEvents.contains(id) && addSingleClusters) {
324                                 last = String.format("(%d)", ++otherClusterID);
325                             } else {
326                                 last = "_";
327                             }
328                         }
329                         printToken(outBuilder, token, ++n, last);
330                     }
331 
332                     outBuilder.append("\n");
333 
334                     if (useThis) {
335                         writer.append(outBuilder.toString());
336                     }
337                 }
338 
339             }
340             writer.close();
341 
342         } catch (Exception e) {
343             CommandLine.fail(e);
344 
345         }
346     }
347 
348 }