1 package eu.fbk.dkm.pikes.resources.ecb;
2
import com.google.common.collect.HashMultimap;
import com.google.common.io.Files;
import eu.fbk.utils.core.CommandLine;
import eu.fbk.utils.core.IO;
import ixa.kaflib.Coref;
import ixa.kaflib.KAFDocument;
import ixa.kaflib.Span;
import ixa.kaflib.Term;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.zip.GZIPInputStream;
30
31
32
33
34
35 public class MergeECBPlus {
36
37 private static final Logger LOGGER = LoggerFactory.getLogger(MergeECBPlus.class);
38
39 public static void main(String[] args) {
40 try {
41 final CommandLine cmd = CommandLine
42 .parser()
43 .withName("./ecbplus-merger")
44 .withHeader("Add mentions to NAF folder for ECB resource")
45 .withOption("i", "input-xml", "Input XML folder", "FOLDER",
46 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
47 .withOption("n", "input-naf", "Input NAF folder", "FOLDER",
48 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
49 .withOption("o", "output", "Output folder", "FOLDER",
50 CommandLine.Type.DIRECTORY, true, false, true)
51 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
52
53 File inputFolder = cmd.getOptionValue("input-xml", File.class);
54 File nafFolder = cmd.getOptionValue("input-naf", File.class);
55 File outputFolder = cmd.getOptionValue("output", File.class);
56
57 if (!outputFolder.exists()) {
58 outputFolder.mkdirs();
59 }
60
61 File[] files = inputFolder.listFiles();
62 for (File file : files) {
63 if (!file.isDirectory()) {
64 continue;
65 }
66
67 File[] thisFolderFiles = file.listFiles();
68 for (File ecbFile : thisFolderFiles) {
69 if (!ecbFile.isFile()) {
70 continue;
71 }
72 if (!ecbFile.getAbsolutePath().endsWith(".xml")) {
73 continue;
74 }
75
76 String relativeFilePath = ecbFile.getAbsolutePath()
77 .substring(inputFolder.getAbsolutePath().length());
78 if (relativeFilePath.startsWith(File.separator)) {
79 relativeFilePath = relativeFilePath.substring(1);
80 }
81 String naf = nafFolder.getAbsolutePath() + File.separator + relativeFilePath + ".naf.gz";
82 File nafFile = new File(naf);
83
84 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
85 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
86 XPathFactory xPathfactory = XPathFactory.newInstance();
87 XPath xpath = xPathfactory.newXPath();
88
89 XPathExpression expr;
90 NodeList nl;
91
92 Document doc = dBuilder.parse(IO.read(ecbFile.getAbsolutePath()));
93 doc.getDocumentElement().normalize();
94
95 Map<Integer, Integer> offsets = new HashMap<>();
96
97 HashMultimap<String, Integer> clusterOffsets = HashMultimap.create();
98
99
100 expr = xpath.compile("/Document/token");
101 nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
102
103 StringBuffer buffer = new StringBuffer();
104 StringBuffer text = new StringBuffer();
105 int lastSent = 0;
106 int offset = 0;
107 for (int i = 0; i < nl.getLength(); i++) {
108 Node item = nl.item(i);
109 Element element = (Element) item;
110 String token = element.getTextContent();
111
112 int t_id = Integer.parseInt(element.getAttribute("t_id"));
113 offsets.put(t_id, offset);
114
115 int sentence = Integer.parseInt(element.getAttribute("sentence"));
116 if (relativeFilePath.contains("ecbplus") && sentence == 0) {
117 continue;
118 }
119 if (sentence != lastSent) {
120 if (buffer.length() > 0) {
121 text.append(buffer.toString().trim()).append("\n");
122 }
123 buffer = new StringBuffer();
124 lastSent = sentence;
125 }
126
127 buffer.append(token).append(" ");
128 offset = text.length() + buffer.length();
129 }
130 if (buffer.length() > 0) {
131 text.append(buffer.toString().trim()).append("\n");
132 }
133
134 expr = xpath.compile("/Document/Markables/ACTION_OCCURRENCE");
135 nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
136 for (int i = 0; i < nl.getLength(); i++) {
137 Node item = nl.item(i);
138 Element element = (Element) item;
139
140 String clusterID = element.getAttribute("m_id");
141
142 NodeList elements = element.getElementsByTagName("token_anchor");
143 for (int j = 0; j < elements.getLength(); j++) {
144 Node item2 = elements.item(j);
145 Element element2 = (Element) item2;
146
147 int t_id = Integer.parseInt(element2.getAttribute("t_id"));
148 clusterOffsets.put(clusterID, offsets.get(t_id));
149 break;
150 }
151
152 }
153
154 FileInputStream bais = new FileInputStream(nafFile);
155 GZIPInputStream gzis = new GZIPInputStream(bais);
156 InputStreamReader reader = new InputStreamReader(gzis);
157 BufferedReader in = new BufferedReader(reader);
158
159 KAFDocument nafDocument = KAFDocument.createFromStream(in);
160
161 Map<Integer, Term> termsHashMap = new HashMap<>();
162 for (Term term : nafDocument.getTerms()) {
163 termsHashMap.put(term.getOffset(), term);
164 }
165
166 for (String clusterId : clusterOffsets.keySet()) {
167 Set<Integer> terms = clusterOffsets.get(clusterId);
168 List<Span<Term>> termsList = new ArrayList<>();
169 for (Integer termOffset : terms) {
170 Term term = termsHashMap.get(termOffset);
171 if (term == null) {
172 LOGGER.error("Term is null!");
173 continue;
174 }
175 Span<Term> termSpan = KAFDocument.newTermSpan();
176 termSpan.addTarget(term);
177 termsList.add(termSpan);
178 }
179
180 if (termsList.size() == 0) {
181 continue;
182 }
183
184 Coref coref = nafDocument.newCoref(termsList);
185 coref.setCluster(clusterId);
186 coref.setType("event-gold");
187 }
188
189 String outFileName = outputFolder.getAbsolutePath() + File.separator + relativeFilePath + ".naf";
190 File outputFile = new File(outFileName);
191 Files.createParentDirs(outputFile);
192 nafDocument.save(outputFile);
193 }
194 }
195 } catch (Exception e) {
196 CommandLine.fail(e);
197 }
198 }
199 }