1 package eu.fbk.dkm.pikes.resources.mpqa;
2
3 import com.google.common.io.Files;
4 import eu.fbk.rdfpro.util.Statements;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.KAFDocument;
7 import org.slf4j.LoggerFactory;
8 import org.w3c.dom.Document;
9 import org.w3c.dom.Element;
10 import org.w3c.dom.Node;
11 import org.w3c.dom.NodeList;
12
13 import javax.xml.parsers.DocumentBuilder;
14 import javax.xml.parsers.DocumentBuilderFactory;
15 import javax.xml.stream.XMLStreamException;
16 import java.io.File;
17 import java.io.IOException;
18 import java.util.HashMap;
19 import java.util.LinkedHashMap;
20
21
22
23
24
25 public class JohanssonPreprocessor {
26
27 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(JohanssonPreprocessor.class);
28 public static final String DEFAULT_NAMESPACE = "http://eu.fbk.dkm.pikes.resources.mpqa.cs.pitt.edu/corpora/mpqa_corpus/";
29
30 private static class Span {
31 private int start, end, id;
32 private String value;
33
34 public Span(int start, int end, int id, String value) {
35 this.start = start;
36 this.end = end;
37 this.id = id;
38 this.value = value;
39 }
40
41 public int getStart() {
42 return start;
43 }
44
45 public int getEnd() {
46 return end;
47 }
48
49 public int getId() {
50 return id;
51 }
52
53 public String getValue() {
54 return value;
55 }
56
57 @Override
58 public String toString() {
59 return "Span{" +
60 "start=" + start +
61 ", end=" + end +
62 ", id=" + id +
63 ", value='" + value + '\'' +
64 '}';
65 }
66 }
67
68 public static void main(final String[] args) throws IOException, XMLStreamException {
69 try {
70 final CommandLine cmd = CommandLine
71 .parser()
72 .withName("corpus-preprocessor")
73 .withHeader(
74 "Produces NAF files starting from the MPQA v.2 corpus preprocessed by Johansson/Moschitti.")
75 .withOption("i", "input-path", "the base path of the Johansson MPQA corpus", "DIR",
76 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
77 .withOption("o", "output",
78 "the output path where to save produced files",
79 "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
80 .withOption("n", "namespace",
81 String.format("the namespace for generating document URIs, default %s", DEFAULT_NAMESPACE),
82 "NS", CommandLine.Type.STRING, true, false, false)
83 .withOption("doc", "doc", "Check only one document", "URL", CommandLine.Type.STRING, true, false, false)
84 .withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
85
86 final File inputPath = cmd.getOptionValue("i", File.class);
87
88 final File outputPath = cmd.getOptionValue("o", File.class);
89 if (!outputPath.exists()) {
90 outputPath.mkdirs();
91 }
92
93 String namespace = DEFAULT_NAMESPACE;
94 if (cmd.hasOption("n")) {
95 namespace = cmd.getOptionValue("n", String.class);
96 }
97
98 String checkOneDoc = cmd.getOptionValue("doc", String.class);
99
100 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
101 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
102
103 for (File f : Files.fileTreeTraverser().preOrderTraversal(inputPath)) {
104
105
106 if (!f.getAbsolutePath().endsWith("tokens.xml")) {
107 continue;
108 }
109
110 final String name = f.getName().replace('/', '_');
111 final String documentURI = namespace + name;
112
113 if (checkOneDoc != null && !checkOneDoc.equals(documentURI)) {
114 continue;
115 }
116
117 Document doc = dBuilder.parse(f);
118
119 NodeList nList = doc.getElementsByTagName("annotation");
120
121 HashMap<String, LinkedHashMap<Integer, Span>> spans = new HashMap<>();
122
123 for (int temp = 0; temp < nList.getLength(); temp++) {
124 Node nNode = nList.item(temp);
125 if (nNode.getNodeType() != Node.ELEMENT_NODE) {
126 continue;
127 }
128
129 Element eElement = (Element) nNode;
130
131 String provides = eElement.getAttribute("provides");
132 if (spans.get(provides) == null) {
133 spans.put(provides, new LinkedHashMap<>());
134 }
135
136 NodeList eS = eElement.getElementsByTagName("e");
137 for (int spanID = 0; spanID < eS.getLength(); spanID++) {
138 Node span = eS.item(spanID);
139 if (span.getNodeType() != Node.ELEMENT_NODE) {
140 continue;
141 }
142
143 Element eSpan = (Element) span;
144
145 Integer id = Integer.parseInt(eSpan.getAttribute("id"));
146 Integer start = Integer.parseInt(eSpan.getAttribute("start").replaceAll("#", ""));
147 Integer end = Integer.parseInt(eSpan.getAttribute("end").replaceAll("#", ""));
148 String value = eSpan.getTextContent();
149
150 Span s = new Span(start, end, id, value);
151 spans.get(provides).put(s.id, s);
152 }
153 }
154
155 StringBuffer buffer = new StringBuffer();
156
157 Integer lastToken = 0;
158 for (Span span : spans.get("SENTENCES").values()) {
159 if (span.start != lastToken + 1) {
160 LOGGER.warn("Missing sentence [{}/{}]", f.getName(), span.start);
161 for (int i = lastToken + 1; i < span.start; i++) {
162 String token = spans.get("TOKENS").get(i).getValue();
163 token = token.replace(' ', '_');
164 token = token.replace('<', '_');
165 token = token.replace('>', '_');
166 buffer.append(token).append(" ");
167 }
168 buffer.append("\n");
169 }
170 for (int i = span.start; i <= span.end; i++) {
171 String token = spans.get("TOKENS").get(i).getValue();
172 token = token.replace(' ', '_');
173 token = token.replace('<', '_');
174 token = token.replace('>', '_');
175 buffer.append(token).append(" ");
176 }
177 lastToken = span.end;
178 buffer.append("\n");
179 }
180
181 String text = buffer.toString();
182
183 File nafFile = new File(outputPath.getAbsolutePath() + File.separator + name);
184
185 final KAFDocument document = new KAFDocument("en", "v3");
186
187 document.setRawText(text);
188
189 document.createPublic();
190 document.getPublic().publicId = Statements.VALUE_FACTORY.createIRI(documentURI).getLocalName();
191 document.getPublic().uri = documentURI;
192
193 document.createFileDesc();
194
195
196
197
198
199
200 document.save(nafFile.getAbsolutePath());
201
202 }
203
204
205 } catch (final Throwable ex) {
206 CommandLine.fail(ex);
207 }
208 }
209
210 }