1   package eu.fbk.dkm.pikes.resources.ecb;
2   
3   import com.google.common.collect.HashMultimap;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.*;
7   import org.apache.commons.io.FileUtils;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  
11  import javax.xml.parsers.DocumentBuilder;
12  import javax.xml.parsers.DocumentBuilderFactory;
13  import java.io.BufferedReader;
14  import java.io.File;
15  import java.io.FileInputStream;
16  import java.io.InputStreamReader;
17  import java.util.*;
18  import java.util.regex.Matcher;
19  import java.util.regex.Pattern;
20  import java.util.zip.GZIPInputStream;
21  
22  /**
23   * Created by marcorospocher on 12/03/16.
24   */
25  public class ECBmerger {
26  
27      private static final Logger LOGGER = LoggerFactory.getLogger(ECBmerger.class);
28      private static final Pattern mentionPattern = Pattern.compile("<([^>]*)>");
29      private static final Pattern chainPattern = Pattern.compile("CHAIN=\"([0-9]+)\"");
30  
31      public static void main(String[] args) {
32          try {
33              final CommandLine cmd = CommandLine
34                      .parser()
35                      .withName("./taol-extractor")
36                      .withHeader("Add mentions to NAF folder for ECB resource")
37                      .withOption("i", "input-txt", "Input TXT folder", "FOLDER",
38                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
39                      .withOption("n", "input-naf", "Input NAF folder", "FOLDER",
40                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
41                      .withOption("o", "output", "Output folder", "FOLDER",
42                              CommandLine.Type.DIRECTORY, true, false, true)
43                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
44  
45              File inputFolder = cmd.getOptionValue("input-txt", File.class);
46              File nafFolder = cmd.getOptionValue("input-naf", File.class);
47              File outputFolder = cmd.getOptionValue("output", File.class);
48  
49              if (!outputFolder.exists()) {
50                  outputFolder.mkdirs();
51              }
52  
53              DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
54              DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
55  
56              // uncomment to get the manual mention spans
57              //Pattern MY_PATTERN = Pattern.compile("\\\">[^<]*</MENTION>");
58  
59              String tags;
60  
61              int i = 0;
62              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
63                  if (!file.isFile()) {
64                      continue;
65                  }
66                  if (file.getName().startsWith(".")) {
67                      continue;
68                  }
69  
70                  String path = file.getParentFile().toString();
71                  String folder = path.substring(path.lastIndexOf("/"));
72                  String local_name = folder + File.separator + file.getName();
73  
74                  String naf = nafFolder + File.separator + folder + File.separator + file.getName()
75                          .replace("ecb.txt", "naf.gz");
76  
77                  FileInputStream bais = new FileInputStream(naf);
78                  GZIPInputStream gzis = new GZIPInputStream(bais);
79                  InputStreamReader reader = new InputStreamReader(gzis);
80                  BufferedReader in = new BufferedReader(reader);
81  
82                  KAFDocument nafDocument = KAFDocument.createFromStream(in);
83  
84  //                Map<Integer, Predicate> predicateHashMap = new HashMap<>();
85  //                for (Predicate predicate : nafDocument.getPredicates()) {
86  //                    for (Term term : predicate.getTerms()) {
87  //                        predicateHashMap.put(term.getOffset(), predicate);
88  //                    }
89  //                }
90  
91                  Map<Integer, Term> termsHashMap = new HashMap<>();
92                  for (Term term : nafDocument.getTerms()) {
93                      termsHashMap.put(term.getOffset(), term);
94                  }
95  
96                  HashMultimap<String, Integer> clusterOffsets = HashMultimap.create();
97  
98                  String content = FileUtils.readFileToString(file, "utf-8");
99  
100                 Matcher matcher = mentionPattern.matcher(content);
101                 int offset = 0;
102                 int lastEnd = 0;
103                 while (matcher.find()) {
104                     offset += matcher.start() - lastEnd;
105                     if (!matcher.group().startsWith("</")) {
106                         Matcher chainMatcher = chainPattern.matcher(matcher.group());
107                         if (!chainMatcher.find()) {
108                             LOGGER.error("No chain found!");
109                             continue;
110                         }
111 
112                         String chain = chainMatcher.group(1);
113                         clusterOffsets.put(chain, offset);
114                     }
115                     lastEnd = matcher.end();
116                 }
117 
118                 for (String clusterId : clusterOffsets.keySet()) {
119                     Set<Integer> terms = clusterOffsets.get(clusterId);
120                     List<Span<Term>> termsList = new ArrayList<>();
121                     for (Integer termOffset : terms) {
122                         Term term = termsHashMap.get(termOffset);
123                         if (term == null) {
124                             LOGGER.error("Term is null!");
125                             continue;
126                         }
127                         Span<Term> termSpan = KAFDocument.newTermSpan();
128                         termSpan.addTarget(term);
129                         termsList.add(termSpan);
130                     }
131                     Coref coref = nafDocument.newCoref(termsList);
132                     coref.setCluster(clusterId);
133                     coref.setType("event-gold");
134                 }
135 
136                 File outputFile = new File(
137                         outputFolder.getAbsolutePath() + File.separator +
138                                 file.getAbsolutePath().substring(
139                                         inputFolder.getAbsolutePath().length()).replace(".ecb.txt", ".naf"));
140                 Files.createParentDirs(outputFile);
141                 nafDocument.save(outputFile);
142 
143             }
144 
145         } catch (Exception e) {
146             CommandLine.fail(e);
147         }
148 
149     }
150 
151 }