1   package eu.fbk.dkm.pikes.resources.ecb;
2   
import com.google.common.collect.HashMultimap;
import com.google.common.io.Files;
import eu.fbk.utils.core.CommandLine;
import eu.fbk.utils.core.IO;
import ixa.kaflib.Coref;
import ixa.kaflib.KAFDocument;
import ixa.kaflib.Span;
import ixa.kaflib.Term;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.zip.GZIPInputStream;
30  
31  /**
32   * Created by alessio on 28/09/16.
33   */
34  
35  public class MergeECBPlus {
36  
37      private static final Logger LOGGER = LoggerFactory.getLogger(MergeECBPlus.class);
38  
39      public static void main(String[] args) {
40          try {
41              final CommandLine cmd = CommandLine
42                      .parser()
43                      .withName("./ecbplus-merger")
44                      .withHeader("Add mentions to NAF folder for ECB resource")
45                      .withOption("i", "input-xml", "Input XML folder", "FOLDER",
46                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
47                      .withOption("n", "input-naf", "Input NAF folder", "FOLDER",
48                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
49                      .withOption("o", "output", "Output folder", "FOLDER",
50                              CommandLine.Type.DIRECTORY, true, false, true)
51                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
52  
53              File inputFolder = cmd.getOptionValue("input-xml", File.class);
54              File nafFolder = cmd.getOptionValue("input-naf", File.class);
55              File outputFolder = cmd.getOptionValue("output", File.class);
56  
57              if (!outputFolder.exists()) {
58                  outputFolder.mkdirs();
59              }
60  
61              File[] files = inputFolder.listFiles();
62              for (File file : files) {
63                  if (!file.isDirectory()) {
64                      continue;
65                  }
66  
67                  File[] thisFolderFiles = file.listFiles();
68                  for (File ecbFile : thisFolderFiles) {
69                      if (!ecbFile.isFile()) {
70                          continue;
71                      }
72                      if (!ecbFile.getAbsolutePath().endsWith(".xml")) {
73                          continue;
74                      }
75  
76                      String relativeFilePath = ecbFile.getAbsolutePath()
77                              .substring(inputFolder.getAbsolutePath().length());
78                      if (relativeFilePath.startsWith(File.separator)) {
79                          relativeFilePath = relativeFilePath.substring(1);
80                      }
81                      String naf = nafFolder.getAbsolutePath() + File.separator + relativeFilePath + ".naf.gz";
82                      File nafFile = new File(naf);
83  
84                      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
85                      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
86                      XPathFactory xPathfactory = XPathFactory.newInstance();
87                      XPath xpath = xPathfactory.newXPath();
88  
89                      XPathExpression expr;
90                      NodeList nl;
91  
92                      Document doc = dBuilder.parse(IO.read(ecbFile.getAbsolutePath()));
93                      doc.getDocumentElement().normalize();
94  
95                      Map<Integer, Integer> offsets = new HashMap<>();
96  //                    Map<Integer, Integer> anchors = new HashMap<>();
97                      HashMultimap<String, Integer> clusterOffsets = HashMultimap.create();
98  
99                      // Normalization rules
100                     expr = xpath.compile("/Document/token");
101                     nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
102 
103                     StringBuffer buffer = new StringBuffer();
104                     StringBuffer text = new StringBuffer();
105                     int lastSent = 0;
106                     int offset = 0;
107                     for (int i = 0; i < nl.getLength(); i++) {
108                         Node item = nl.item(i);
109                         Element element = (Element) item;
110                         String token = element.getTextContent();
111 
112                         int t_id = Integer.parseInt(element.getAttribute("t_id"));
113                         offsets.put(t_id, offset);
114 
115                         int sentence = Integer.parseInt(element.getAttribute("sentence"));
116                         if (relativeFilePath.contains("ecbplus") && sentence == 0) {
117                             continue;
118                         }
119                         if (sentence != lastSent) {
120                             if (buffer.length() > 0) {
121                                 text.append(buffer.toString().trim()).append("\n");
122                             }
123                             buffer = new StringBuffer();
124                             lastSent = sentence;
125                         }
126 
127                         buffer.append(token).append(" ");
128                         offset = text.length() + buffer.length();
129                     }
130                     if (buffer.length() > 0) {
131                         text.append(buffer.toString().trim()).append("\n");
132                     }
133 
134                     expr = xpath.compile("/Document/Markables/ACTION_OCCURRENCE");
135                     nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
136                     for (int i = 0; i < nl.getLength(); i++) {
137                         Node item = nl.item(i);
138                         Element element = (Element) item;
139 
140                         String clusterID = element.getAttribute("m_id");
141 
142                         NodeList elements = element.getElementsByTagName("token_anchor");
143                         for (int j = 0; j < elements.getLength(); j++) {
144                             Node item2 = elements.item(j);
145                             Element element2 = (Element) item2;
146 
147                             int t_id = Integer.parseInt(element2.getAttribute("t_id"));
148                             clusterOffsets.put(clusterID, offsets.get(t_id));
149                             break;
150                         }
151 
152                     }
153 
154                     FileInputStream bais = new FileInputStream(nafFile);
155                     GZIPInputStream gzis = new GZIPInputStream(bais);
156                     InputStreamReader reader = new InputStreamReader(gzis);
157                     BufferedReader in = new BufferedReader(reader);
158 
159                     KAFDocument nafDocument = KAFDocument.createFromStream(in);
160 
161                     Map<Integer, Term> termsHashMap = new HashMap<>();
162                     for (Term term : nafDocument.getTerms()) {
163                         termsHashMap.put(term.getOffset(), term);
164                     }
165 
166                     for (String clusterId : clusterOffsets.keySet()) {
167                         Set<Integer> terms = clusterOffsets.get(clusterId);
168                         List<Span<Term>> termsList = new ArrayList<>();
169                         for (Integer termOffset : terms) {
170                             Term term = termsHashMap.get(termOffset);
171                             if (term == null) {
172                                 LOGGER.error("Term is null!");
173                                 continue;
174                             }
175                             Span<Term> termSpan = KAFDocument.newTermSpan();
176                             termSpan.addTarget(term);
177                             termsList.add(termSpan);
178                         }
179 
180                         if (termsList.size() == 0) {
181                             continue;
182                         }
183 
184                         Coref coref = nafDocument.newCoref(termsList);
185                         coref.setCluster(clusterId);
186                         coref.setType("event-gold");
187                     }
188 
189                     String outFileName = outputFolder.getAbsolutePath() + File.separator + relativeFilePath + ".naf";
190                     File outputFile = new File(outFileName);
191                     Files.createParentDirs(outputFile);
192                     nafDocument.save(outputFile);
193                 }
194             }
195         } catch (Exception e) {
196             CommandLine.fail(e);
197         }
198     }
199 }