1   package eu.fbk.dkm.pikes.resources.goodbadfor;
2   
3   import eu.fbk.dkm.pikes.resources.mpqa.Record;
4   import eu.fbk.dkm.pikes.resources.mpqa.RecordSet;
5   import eu.fbk.rdfpro.util.Statements;
6   import eu.fbk.utils.core.CommandLine;
7   import ixa.kaflib.KAFDocument;
8   import org.apache.commons.io.FileUtils;
9   import org.apache.commons.io.FilenameUtils;
10  import org.apache.commons.lang.StringEscapeUtils;
11  import org.apache.commons.lang.StringUtils;
12  import org.slf4j.Logger;
13  import org.slf4j.LoggerFactory;
14  import org.w3c.dom.Document;
15  import org.w3c.dom.Node;
16  import org.w3c.dom.NodeList;
17  import org.xml.sax.SAXException;
18  
19  import javax.xml.parsers.DocumentBuilder;
20  import javax.xml.parsers.DocumentBuilderFactory;
21  import javax.xml.parsers.ParserConfigurationException;
22  import javax.xml.stream.XMLStreamException;
23  import java.io.File;
24  import java.io.IOException;
25  import java.util.Iterator;
26  import java.util.TreeMap;
27  
28  /**
29   * Created by alessio on 24/03/15.
30   */
31  
32  public class CorpusLoader {
33  
34  	//	static Logger logger = Logger.getLogger(CorpusLoader.class.getName());
35  	private static final Logger LOGGER = LoggerFactory.getLogger(CorpusLoader.class);
36  	private static final int MIN_STR_LEN = 0;
37  	private static final boolean ENABLE_EXTREME_GUESS = true;
38  	public static final String DEFAULT_NAMESPACE = "http://eu.fbk.dkm.pikes.resources.mpqa.cs.pitt.edu/corpora/gfbf_corpus/";
39  
40  	private static int textAfterGuessingOverlap(String text, Record record, int expectedLength, String span1) {
41  		return textAfterGuessingOverlap(text, record, expectedLength, span1, false);
42  	}
43  
44  	private static int textAfterGuessingOverlap(String text, Record record, int expectedLength, String span1, boolean trim) {
45  		int maxTo = Math.max(expectedLength * 3, 15);
46  		for (int i = 0; i < maxTo; i++) {
47  			int start = record.getSpan().begin - i;
48  			String span = text.substring(start, start + expectedLength);
49  
50  			if (trim) {
51  				span1 = span1.replaceAll("[^0-9a-zA-Z]", "");
52  				span = span.replaceAll("[^0-9a-zA-Z]", "");
53  			}
54  
55  			LOGGER.trace("Span1: {}", span);
56  			LOGGER.trace("Span2: {}", span1);
57  
58  			if (span1.equals(span) && span.length() > MIN_STR_LEN) {
59  				LOGGER.trace("Adding {}", i);
60  				return i;
61  			}
62  		}
63  
64  		return -1;
65  	}
66  
67  	public static String getTextFromGateFile(File file) throws ParserConfigurationException, IOException, SAXException {
68  		DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
69  		DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
70  		Document doc = dBuilder.parse(file);
71  		NodeList nList = doc.getElementsByTagName("TextWithNodes");
72  		for (int temp = 0; temp < nList.getLength(); temp++) {
73  
74  			Node nNode = nList.item(temp);
75  			return nNode.getTextContent();
76  		}
77  
78  		return null;
79  	}
80  
81  	public static void main(final String[] args) throws IOException, XMLStreamException {
82  		try {
83  			final CommandLine cmd = CommandLine
84  					.parser()
85  					.withName("eu.fbk.dkm.pikes.resources.goodbadfor-loader")
86  					.withHeader("Load goodFor/badFor library")
87  					.withOption("i", "input-path", "the base path of the corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
88  					.withOption("n", "namespace", String.format("the namespace for generating document URIs, default %s", DEFAULT_NAMESPACE), "NS", CommandLine.Type.STRING, true, false, false)
89  					.withOption("t", "test", "test only on this file", "FILE", CommandLine.Type.STRING, true, false, false)
90  					.withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
91  
92  			final File inputPath = cmd.getOptionValue("i", File.class);
93  
94  			File documentsFolder = new File(inputPath.getAbsolutePath() + File.separator + "GATE" + File.separator);
95  			File annotationsFolder = new File(inputPath.getAbsolutePath() + File.separator + "MPQA" + File.separator);
96  			File nafFolder = new File(inputPath.getAbsolutePath() + File.separator + "NAF" + File.separator);
97  
98  			String namespace = DEFAULT_NAMESPACE;
99  			if (cmd.hasOption("n")) {
100 				namespace = cmd.getOptionValue("n", String.class);
101 			}
102 
103 			String testFile = cmd.getOptionValue("t", String.class);
104 
105 			if (!documentsFolder.exists()) {
106 				LOGGER.error("Folder {} does not exist", documentsFolder.getAbsolutePath());
107 			}
108 			if (!annotationsFolder.exists()) {
109 				LOGGER.error("Folder {} does not exist", annotationsFolder.getAbsolutePath());
110 			}
111 
112 			if (nafFolder.exists()) {
113 				LOGGER.error("Folder {} exists", nafFolder.getAbsolutePath());
114 			}
115 			nafFolder.mkdir();
116 
117 			int skippedRows = 0;
118 			int totalRows = 0;
119 			int docsNo = 0;
120 
121 			Iterator<File> fileIterator;
122 			fileIterator = FileUtils.iterateFiles(documentsFolder, new String[]{"xml"}, false);
123 
124 			while (fileIterator.hasNext()) {
125 				File file = fileIterator.next();
126 				String fileBaseName = FilenameUtils.removeExtension(file.getName());
127 
128 				if (testFile != null && !testFile.equals(fileBaseName)) {
129 					continue;
130 				}
131 				String nafFileName = fileBaseName + ".naf";
132 				File nafFile = new File(nafFolder + File.separator + nafFileName);
133 				File mpqaFile = new File(annotationsFolder.getAbsolutePath() + File.separator + fileBaseName + ".eu.fbk.dkm.pikes.resources.mpqa");
134 
135 				LOGGER.info(String.format("Loading file %s", mpqaFile));
136 				if (!mpqaFile.exists()) {
137 					LOGGER.warn("File {} does not exist", mpqaFile.getAbsolutePath());
138 					continue;
139 				}
140 
141 				String text = getTextFromGateFile(file);
142 				if (text == null) {
143 					LOGGER.warn("text is null");
144 					continue;
145 				}
146 				String documentURI = namespace + nafFileName;
147 
148 				docsNo++;
149 
150 				LOGGER.trace("Original text length: {}", text.length());
151 				int originalTextLength = text.length();
152 
153 				final RecordSet annotations = RecordSet.readFromFile(mpqaFile);
154 				totalRows += annotations.getRecords().size();
155 				TreeMap<Integer, Record> records = new TreeMap<>();
156 				for (Record record : annotations.getRecords()) {
157 					records.put(record.getSpan().begin, record);
158 				}
159 
160 				for (Record record : records.values()) {
161 					String span1 = record.getValue("span");
162 					String span2 = record.getSpan().apply(text, false);
163 
164 					if (span1 == null || span2 == null) {
165 						continue;
166 					}
167 
168 					span1 = StringEscapeUtils.unescapeHtml(span1);
169 
170 					if (!span1.trim().equals(span2.trim())) {
171 						int expectedLength = record.getSpan().end - record.getSpan().begin;
172 
173 						String span1OnlyLetters = span1.replaceAll("[^0-9a-zA-Z]", "");
174 						String span2OnlyLetters = span2.replaceAll("[^0-9a-zA-Z]", "");
175 
176 						if (expectedLength != span1.length() && (!span1OnlyLetters.equals(span2OnlyLetters) || span1OnlyLetters.length() < MIN_STR_LEN)) {
177 							LOGGER.debug("Span: {}", span1);
178 							LOGGER.debug("Length: {}/{}", span1.length(), expectedLength);
179 							LOGGER.debug("Text: {}", span2);
180 
181 							if (ENABLE_EXTREME_GUESS) {
182 								int offset = textAfterGuessingOverlap(text, record, expectedLength, span1, true);
183 								text = new StringBuilder(text).insert(record.getSpan().begin - offset, StringUtils.repeat(" ", offset)).toString();
184 								LOGGER.debug("Guessed offset: {}", offset);
185 								continue;
186 							}
187 
188 							// Skip
189 							skippedRows++;
190 							continue;
191 						}
192 //						System.out.println("DIFF");
193 //						System.out.println(record.getSpan());
194 //						System.out.println(expectedLength);
195 //						System.out.println(span1);
196 //						System.out.println(span1.length());
197 //						System.out.println(span2);
198 //						System.out.println(span2.length());
199 //						System.out.println();
200 
201 						if (span1OnlyLetters.equals(span2OnlyLetters)) {
202 							LOGGER.trace("Identical unless blanks - {}", span1OnlyLetters);
203 							continue;
204 						}
205 
206 						// Guessing overlap
207 						int offset = textAfterGuessingOverlap(text, record, expectedLength, span1);
208 						if (offset != -1) {
209 							text = new StringBuilder(text).insert(record.getSpan().begin - offset, StringUtils.repeat(" ", offset)).toString();
210 						}
211 						else {
212 							skippedRows++;
213 							LOGGER.warn("Span not found: {}", record.toString());
214 						}
215 					}
216 				}
217 
218 				LOGGER.trace("Final text length: {}", text.length());
219 				int diff = text.length() - originalTextLength;
220 				if (diff != 0) {
221 					LOGGER.debug("Difference in length: {}", diff);
222 				}
223 
224 				text = text.replaceAll("\\s", "&nbsp;");
225 
226 				final KAFDocument document = new KAFDocument("en", "v3");
227 				document.setRawText(text);
228 				document.createPublic();
229 				document.getPublic().publicId = Statements.VALUE_FACTORY.createIRI(documentURI).getLocalName();
230 				document.getPublic().uri = documentURI;
231 				document.createFileDesc();
232 				document.getFileDesc().filename = nafFileName;
233 				document.getFileDesc().title = "-";
234 				document.save(nafFile.getAbsolutePath());
235 			}
236 
237 			LOGGER.info("=== Statistics ===");
238 			LOGGER.info("Total documents: {}", docsNo);
239 			LOGGER.info("Total rows: {}", totalRows);
240 			LOGGER.info("Skipped rows: {}", skippedRows);
241 
242 //			fileIterator = FileUtils.iterateFiles(annotationsFolder, new String[]{"eu.fbk.dkm.pikes.resources.mpqa"}, false);
243 //			while (fileIterator.hasNext()) {
244 //				File file = fileIterator.next();
245 //				String fileBaseName = FilenameUtils.removeExtension(file.getName());
246 //				final RecordSet annotations = RecordSet.readFromFile(file);
247 //				for (Record record : annotations.getRecords()) {
248 //					System.out.println(fileBaseName);
249 //					System.out.println(record.getSpan());
250 //					System.out.println(record.getSpan().apply(texts.get(fileBaseName), false));
251 //					System.out.println();
252 //				}
253 //			}
254 
255 		} catch (final Throwable ex) {
256 			CommandLine.fail(ex);
257 		}
258 	}
259 
260 
261 }