1 package eu.fbk.dkm.pikes.resources.goodbadfor;
2
3 import eu.fbk.dkm.pikes.resources.mpqa.Record;
4 import eu.fbk.dkm.pikes.resources.mpqa.RecordSet;
5 import eu.fbk.rdfpro.util.Statements;
6 import eu.fbk.utils.core.CommandLine;
7 import ixa.kaflib.KAFDocument;
8 import org.apache.commons.io.FileUtils;
9 import org.apache.commons.io.FilenameUtils;
10 import org.apache.commons.lang.StringEscapeUtils;
11 import org.apache.commons.lang.StringUtils;
12 import org.slf4j.Logger;
13 import org.slf4j.LoggerFactory;
14 import org.w3c.dom.Document;
15 import org.w3c.dom.Node;
16 import org.w3c.dom.NodeList;
17 import org.xml.sax.SAXException;
18
19 import javax.xml.parsers.DocumentBuilder;
20 import javax.xml.parsers.DocumentBuilderFactory;
21 import javax.xml.parsers.ParserConfigurationException;
22 import javax.xml.stream.XMLStreamException;
23 import java.io.File;
24 import java.io.IOException;
25 import java.util.Iterator;
26 import java.util.TreeMap;
27
28
29
30
31
32 public class CorpusLoader {
33
34
35 private static final Logger LOGGER = LoggerFactory.getLogger(CorpusLoader.class);
36 private static final int MIN_STR_LEN = 0;
37 private static final boolean ENABLE_EXTREME_GUESS = true;
38 public static final String DEFAULT_NAMESPACE = "http://eu.fbk.dkm.pikes.resources.mpqa.cs.pitt.edu/corpora/gfbf_corpus/";
39
40 private static int textAfterGuessingOverlap(String text, Record record, int expectedLength, String span1) {
41 return textAfterGuessingOverlap(text, record, expectedLength, span1, false);
42 }
43
44 private static int textAfterGuessingOverlap(String text, Record record, int expectedLength, String span1, boolean trim) {
45 int maxTo = Math.max(expectedLength * 3, 15);
46 for (int i = 0; i < maxTo; i++) {
47 int start = record.getSpan().begin - i;
48 String span = text.substring(start, start + expectedLength);
49
50 if (trim) {
51 span1 = span1.replaceAll("[^0-9a-zA-Z]", "");
52 span = span.replaceAll("[^0-9a-zA-Z]", "");
53 }
54
55 LOGGER.trace("Span1: {}", span);
56 LOGGER.trace("Span2: {}", span1);
57
58 if (span1.equals(span) && span.length() > MIN_STR_LEN) {
59 LOGGER.trace("Adding {}", i);
60 return i;
61 }
62 }
63
64 return -1;
65 }
66
67 public static String getTextFromGateFile(File file) throws ParserConfigurationException, IOException, SAXException {
68 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
69 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
70 Document doc = dBuilder.parse(file);
71 NodeList nList = doc.getElementsByTagName("TextWithNodes");
72 for (int temp = 0; temp < nList.getLength(); temp++) {
73
74 Node nNode = nList.item(temp);
75 return nNode.getTextContent();
76 }
77
78 return null;
79 }
80
81 public static void main(final String[] args) throws IOException, XMLStreamException {
82 try {
83 final CommandLine cmd = CommandLine
84 .parser()
85 .withName("eu.fbk.dkm.pikes.resources.goodbadfor-loader")
86 .withHeader("Load goodFor/badFor library")
87 .withOption("i", "input-path", "the base path of the corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
88 .withOption("n", "namespace", String.format("the namespace for generating document URIs, default %s", DEFAULT_NAMESPACE), "NS", CommandLine.Type.STRING, true, false, false)
89 .withOption("t", "test", "test only on this file", "FILE", CommandLine.Type.STRING, true, false, false)
90 .withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
91
92 final File inputPath = cmd.getOptionValue("i", File.class);
93
94 File documentsFolder = new File(inputPath.getAbsolutePath() + File.separator + "GATE" + File.separator);
95 File annotationsFolder = new File(inputPath.getAbsolutePath() + File.separator + "MPQA" + File.separator);
96 File nafFolder = new File(inputPath.getAbsolutePath() + File.separator + "NAF" + File.separator);
97
98 String namespace = DEFAULT_NAMESPACE;
99 if (cmd.hasOption("n")) {
100 namespace = cmd.getOptionValue("n", String.class);
101 }
102
103 String testFile = cmd.getOptionValue("t", String.class);
104
105 if (!documentsFolder.exists()) {
106 LOGGER.error("Folder {} does not exist", documentsFolder.getAbsolutePath());
107 }
108 if (!annotationsFolder.exists()) {
109 LOGGER.error("Folder {} does not exist", annotationsFolder.getAbsolutePath());
110 }
111
112 if (nafFolder.exists()) {
113 LOGGER.error("Folder {} exists", nafFolder.getAbsolutePath());
114 }
115 nafFolder.mkdir();
116
117 int skippedRows = 0;
118 int totalRows = 0;
119 int docsNo = 0;
120
121 Iterator<File> fileIterator;
122 fileIterator = FileUtils.iterateFiles(documentsFolder, new String[]{"xml"}, false);
123
124 while (fileIterator.hasNext()) {
125 File file = fileIterator.next();
126 String fileBaseName = FilenameUtils.removeExtension(file.getName());
127
128 if (testFile != null && !testFile.equals(fileBaseName)) {
129 continue;
130 }
131 String nafFileName = fileBaseName + ".naf";
132 File nafFile = new File(nafFolder + File.separator + nafFileName);
133 File mpqaFile = new File(annotationsFolder.getAbsolutePath() + File.separator + fileBaseName + ".eu.fbk.dkm.pikes.resources.mpqa");
134
135 LOGGER.info(String.format("Loading file %s", mpqaFile));
136 if (!mpqaFile.exists()) {
137 LOGGER.warn("File {} does not exist", mpqaFile.getAbsolutePath());
138 continue;
139 }
140
141 String text = getTextFromGateFile(file);
142 if (text == null) {
143 LOGGER.warn("text is null");
144 continue;
145 }
146 String documentURI = namespace + nafFileName;
147
148 docsNo++;
149
150 LOGGER.trace("Original text length: {}", text.length());
151 int originalTextLength = text.length();
152
153 final RecordSet annotations = RecordSet.readFromFile(mpqaFile);
154 totalRows += annotations.getRecords().size();
155 TreeMap<Integer, Record> records = new TreeMap<>();
156 for (Record record : annotations.getRecords()) {
157 records.put(record.getSpan().begin, record);
158 }
159
160 for (Record record : records.values()) {
161 String span1 = record.getValue("span");
162 String span2 = record.getSpan().apply(text, false);
163
164 if (span1 == null || span2 == null) {
165 continue;
166 }
167
168 span1 = StringEscapeUtils.unescapeHtml(span1);
169
170 if (!span1.trim().equals(span2.trim())) {
171 int expectedLength = record.getSpan().end - record.getSpan().begin;
172
173 String span1OnlyLetters = span1.replaceAll("[^0-9a-zA-Z]", "");
174 String span2OnlyLetters = span2.replaceAll("[^0-9a-zA-Z]", "");
175
176 if (expectedLength != span1.length() && (!span1OnlyLetters.equals(span2OnlyLetters) || span1OnlyLetters.length() < MIN_STR_LEN)) {
177 LOGGER.debug("Span: {}", span1);
178 LOGGER.debug("Length: {}/{}", span1.length(), expectedLength);
179 LOGGER.debug("Text: {}", span2);
180
181 if (ENABLE_EXTREME_GUESS) {
182 int offset = textAfterGuessingOverlap(text, record, expectedLength, span1, true);
183 text = new StringBuilder(text).insert(record.getSpan().begin - offset, StringUtils.repeat(" ", offset)).toString();
184 LOGGER.debug("Guessed offset: {}", offset);
185 continue;
186 }
187
188
189 skippedRows++;
190 continue;
191 }
192
193
194
195
196
197
198
199
200
201 if (span1OnlyLetters.equals(span2OnlyLetters)) {
202 LOGGER.trace("Identical unless blanks - {}", span1OnlyLetters);
203 continue;
204 }
205
206
207 int offset = textAfterGuessingOverlap(text, record, expectedLength, span1);
208 if (offset != -1) {
209 text = new StringBuilder(text).insert(record.getSpan().begin - offset, StringUtils.repeat(" ", offset)).toString();
210 }
211 else {
212 skippedRows++;
213 LOGGER.warn("Span not found: {}", record.toString());
214 }
215 }
216 }
217
218 LOGGER.trace("Final text length: {}", text.length());
219 int diff = text.length() - originalTextLength;
220 if (diff != 0) {
221 LOGGER.debug("Difference in length: {}", diff);
222 }
223
224 text = text.replaceAll("\\s", " ");
225
226 final KAFDocument document = new KAFDocument("en", "v3");
227 document.setRawText(text);
228 document.createPublic();
229 document.getPublic().publicId = Statements.VALUE_FACTORY.createIRI(documentURI).getLocalName();
230 document.getPublic().uri = documentURI;
231 document.createFileDesc();
232 document.getFileDesc().filename = nafFileName;
233 document.getFileDesc().title = "-";
234 document.save(nafFile.getAbsolutePath());
235 }
236
237 LOGGER.info("=== Statistics ===");
238 LOGGER.info("Total documents: {}", docsNo);
239 LOGGER.info("Total rows: {}", totalRows);
240 LOGGER.info("Skipped rows: {}", skippedRows);
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255 } catch (final Throwable ex) {
256 CommandLine.fail(ex);
257 }
258 }
259
260
261 }