1   package eu.fbk.dkm.pikes.resources.mpqa;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.base.Joiner;
5   import com.google.common.collect.HashMultimap;
6   import com.google.common.collect.Lists;
7   import com.google.common.collect.Multimap;
8   import com.google.common.collect.Sets;
9   import com.google.common.html.HtmlEscapers;
10  import com.google.common.io.CharStreams;
11  import com.google.common.io.Files;
12  import eu.fbk.rdfpro.util.Statements;
13  import eu.fbk.utils.core.CommandLine;
14  import eu.fbk.rdfpro.util.IO;
15  import ixa.kaflib.KAFDocument;
16  import org.slf4j.Logger;
17  import org.slf4j.LoggerFactory;
18  
19  import javax.annotation.Nullable;
20  import javax.xml.stream.XMLStreamException;
21  import java.io.*;
22  import java.nio.file.Paths;
23  import java.util.*;
24  import java.util.concurrent.atomic.AtomicInteger;
25  
26  public class CorpusPreprocessor {
27  
28  	private static final Logger LOGGER = LoggerFactory.getLogger(CorpusPreprocessor.class);
29  
30  	private static final String NEWLINE = "
";
31  	private static final String DEFAULT_DOCS_LIST = "doclist.all";
32  	private static final String DEFAULT_NAF_DIR = "NAF";
33  	public static final String DEFAULT_NAMESPACE = "http://eu.fbk.dkm.pikes.resources.mpqa.cs.pitt.edu/corpora/mpqa_corpus/";
34  	public static final String DEFAULT_ANNOTATION_TSV = "annotations.tsv";
35  	public static final String DEFAULT_ANNOTATION_HTML = "annotations.html";
36  
37  	private static final String[] DSA_FIELDS = new String[]{"implicit", "insubstantial",
38  			"polarity", "intensity", "expression-intensity", "annotation-uncertain",
39  			"subjective-uncertain"};
40  
41  	private static final String[] TSV_FIELDS = new String[]{"sentiment", "intensity",
42  			"attitude", "target", "source", "source-local", "sentence", "dsa-implicit",
43  			"dsa-insubstantial", "dsa-polarity", "dsa-intensity", "dsa-expression-intensity",
44  			"dsa-annotation-uncertain", "dsa-subjective-uncertain", "type", "id", "expression"};
45  
46  	private static final String[] MULTI_FIELDS = new String[]{"nested-source", "attitude-link"};
47  
48  	public static void main(final String[] args) throws IOException, XMLStreamException {
49  		try {
50  			final CommandLine cmd = CommandLine
51  					.parser()
52  					.withName("corpus-preprocessor")
53  					.withHeader(
54  							"Produces NAF files, a TSV file with sentiment annotations "
55  									+ "and an HTML file with annotated sentences "
56  									+ "starting from the MPQA v.2 corpus")
57  					.withOption("i", "input-path", "the base path of the MPQA corpus", "DIR",
58  							CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
59  					.withOption("f", "filelist",
60  							String.format("the file with the docs filenames (relative to input path), default [basedir]/%s", DEFAULT_DOCS_LIST), "FILE",
61  							CommandLine.Type.FILE_EXISTING, true, false, false)
62  					.withOption("o", "output",
63  							String.format("the output path where to save produced files, default [basedir]/%s", DEFAULT_NAF_DIR),
64  							"DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
65  					.withOption("n", "namespace",
66  							String.format("the namespace for generating document URIs, default %s", DEFAULT_NAMESPACE),
67  							"NS", CommandLine.Type.STRING, true, false, false)
68  					.withOption("doc", "doc", "Check only one document", "URL", CommandLine.Type.STRING, true, false, false)
69  					.withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
70  
71  			final File inputPath = cmd.getOptionValue("i", File.class);
72  
73  			File outputPath = new File(inputPath.getAbsolutePath() + File.separator + DEFAULT_NAF_DIR);
74  			if (cmd.hasOption("o")) {
75  				outputPath = cmd.getOptionValue("o", File.class);
76  			}
77  			if (!outputPath.exists()) {
78  				outputPath.mkdirs();
79  			}
80  
81  			File filelist = new File(inputPath.getAbsolutePath() + File.separator + DEFAULT_DOCS_LIST);
82  			if (cmd.hasOption("f")) {
83  				filelist = cmd.getOptionValue("f", File.class);
84  			}
85  
86  			String namespace = DEFAULT_NAMESPACE;
87  			if (cmd.hasOption("n")) {
88  				namespace = cmd.getOptionValue("n", String.class);
89  			}
90  
91  			String checkOneDoc = cmd.getOptionValue("doc", String.class);
92  
93  			preprocess(inputPath, outputPath, filelist, namespace, checkOneDoc);
94  
95  		} catch (final Throwable ex) {
96  			CommandLine.fail(ex);
97  		}
98  	}
99  
100 	public static final void preprocess(@Nullable final File inputPath,
101 										@Nullable final File outputPath, final File fileList, final String namespace,
102 										@Nullable final String checkOneDoc)
103 			throws IOException {
104 
105 		final List<String> filenames = Files.readLines(fileList, Charsets.UTF_8);
106 
107 		Writer tsvWriter = null;
108 		Writer htmlWriter = null;
109 
110 		try {
111 			tsvWriter = write(resolve(inputPath, DEFAULT_ANNOTATION_TSV));
112 			htmlWriter = write(resolve(inputPath, DEFAULT_ANNOTATION_HTML));
113 
114 			htmlWriter.write("<html>\n<head>\n<style type=\"text/css\">\n");
115 			htmlWriter.write(".counter { background-color: black; color: white; "
116 					+ "font-size: 80%; font-weight: bold; padding-left: 10px; "
117 					+ "padding-right: 10px; margin-right: 10px;}\n");
118 			htmlWriter.write(".pos { background-color: #95FF4F }\n");
119 			htmlWriter.write(".neg { background-color: #FF9797 }\n");
120 			htmlWriter.write(".source { color: black; font-weight: bold }\n");
121 			htmlWriter.write(".target { color: blue; font-weight: bold }\n");
122 			htmlWriter.write("</style>\n</head>\n<body>\n");
123 
124 			int fileCounter = 0;
125 			final AtomicInteger sentenceCounter = new AtomicInteger(0);
126 			for (final String filename : filenames) {
127 				LOGGER.info("Processing document {}/{}: {}", ++fileCounter, filenames.size(),
128 						filename);
129 
130 				final String name = filename.replace('/', '_');
131 				final String documentURI = namespace + name;
132 
133 				final RecordSet metadata = RecordSet.readFromFile(resolve(inputPath, "meta_anns/"
134 						+ filename));
135 				final RecordSet annotations = RecordSet.readFromFile(resolve(inputPath,
136 						"man_anns/" + filename + "/gateman.eu.fbk.dkm.pikes.resources.mpqa.lre.2.0"));
137 				final RecordSet sentences = RecordSet.readFromFile(resolve(inputPath, "man_anns/"
138 						+ filename + "/gatesentences.eu.fbk.dkm.pikes.resources.mpqa.2.0"));
139 				final String text = fixText(documentURI,
140 						readText(resolve(inputPath, "docs/" + filename)), sentences);
141 				final File nafFile = resolve(outputPath, name + ".naf");
142 
143 				if (checkOneDoc != null && !checkOneDoc.equals(documentURI)) {
144 					continue;
145 				}
146 
147 				if (!text.isEmpty() && !annotations.getRecords().isEmpty()) {
148 					emitNAF(documentURI, text, metadata, nafFile);
149 					emitAnnotations(documentURI, text, annotations, sentences, tsvWriter,
150 							htmlWriter, sentenceCounter);
151 				}
152 			}
153 
154 			htmlWriter.write("</body>\n</html>");
155 
156 		} finally {
157 			IO.closeQuietly(tsvWriter);
158 			IO.closeQuietly(htmlWriter);
159 		}
160 	}
161 
162 	private static void emitNAF(final String documentURI, final String text,
163 								final RecordSet metadata, final File nafFile) {
164 
165 		final String source = metadata.getRecordValue("meta_source", "-");
166 		final String description = metadata.getRecordValue("meta_description", "-");
167 		final String createTime = metadata.getRecordValue("meta_create_time", null);
168 		final String mediaFile = metadata.getRecordValue("meta_media_file", null);
169 		final String mediaType = metadata.getRecordValue("meta_media_type", null);
170 		final String title = metadata.getRecordValue("meta_title", "-");
171 		final String country = metadata.getRecordValue("meta_country", "-");
172 		final String topic = metadata.getRecordValue("meta_topic", "-").toLowerCase();
173 
174 		final KAFDocument document = new KAFDocument("en", "v3");
175 
176 		final StringBuilder builder = new StringBuilder();
177 		int index = 0;
178 		for (; index < text.length(); ++index) {
179 			if (Character.isWhitespace(text.charAt(index))) {
180 				builder.append("&nbsp;");
181 			}
182 			else {
183 				break;
184 			}
185 		}
186 		builder.append(text.substring(index));
187 
188 		document.setRawText(builder.toString());
189 
190 		document.createPublic();
191 		document.getPublic().publicId = Statements.VALUE_FACTORY.createIRI(documentURI).getLocalName();
192 		document.getPublic().uri = documentURI;
193 
194 		document.createFileDesc();
195 		document.getFileDesc().author = source + " / " + description;
196 		document.getFileDesc().creationtime = createTime;
197 		document.getFileDesc().filename = mediaFile;
198 		document.getFileDesc().filetype = mediaType;
199 		document.getFileDesc().title = title + " (" + topic + " / " + country + ")";
200 
201 		document.save(nafFile.getAbsolutePath());
202 	}
203 
204 	private static void emitAnnotations(final String documentURI, final String text,
205 										final RecordSet annotations, final RecordSet sentences, final Writer tsvWriter,
206 										final Writer htmlWriter, final AtomicInteger counter) throws IOException {
207 
208 		// Agents
209 		HashMap<String, Record> agentRecords = new HashMap<>();
210 		HashMultimap<String, Record> lastRecords = HashMultimap.create();
211 
212 		for (final Record agentRecord : annotations.getRecords("GATE_agent")) {
213 			String sourceString = agentRecord.getValue("nested-source");
214 			if (sourceString != null) {
215 				List<String> sources = parseList(sourceString);
216 				if (sources.size() > 0) {
217 					String last = sources.get(sources.size() - 1);
218 					lastRecords.put(last, agentRecord);
219 				}
220 			}
221 
222 			String id = agentRecord.getValue("id");
223 			if (id == null) {
224 				continue;
225 			}
226 			agentRecords.put(id, agentRecord);
227 		}
228 
229 		// Attitudes
230 		for (final Record thisRecord : annotations.getRecords("GATE_attitude")) {
231 
232 			final Multimap<Span, String> highlights = HashMultimap.create();
233 			final Multimap<String, String> fields = HashMultimap.create();
234 
235 			fields.put("type", "attitude");
236 			String id = thisRecord.getValue("id");
237 			if (id != null) {
238 				fields.put("id", id);
239 			}
240 
241 			final Set<String> otherSources = Sets.newHashSet();
242 			final Set<String> otherTargets = Sets.newHashSet();
243 
244 			// Extract sentiment value. Skip if absent
245 			String sentiment = thisRecord.getValue("attitude-type");
246 			if (sentiment == null || !sentiment.startsWith("sentiment-")) {
247 				continue;
248 			}
249 			sentiment = sentiment.substring("sentiment-".length());
250 			fields.put("sentiment", sentiment);
251 
252 			// Extract attitude intensity and span
253 			final Span expressionSpan = thisRecord.getSpan().align(text);
254 			fields.put("expression", expressionSpan.toString());
255 			fields.put("intensity", thisRecord.getValue("intensity"));
256 			highlights.put(expressionSpan, "pos".equals(sentiment) ? "pos" : "neg");
257 
258 			// Extract sentence
259 			Span sentenceSpan = getSentenceSpan(thisRecord, sentences, fields, text, documentURI);
260 			if (sentenceSpan == null) {
261 				continue;
262 			}
263 
264 			// Extract target span
265 			final String targetID = thisRecord.getValue("target-link");
266 			if (targetID != null) {
267 				final Record targetRecord = annotations.getRecord("GATE_target", "id", targetID);
268 				if (targetRecord != null) {
269 					final Span span = targetRecord.getSpan().align(text);
270 					span.check(text, documentURI);
271 					fields.put("target", span.toString());
272 					if (sentenceSpan.contains(span)) {
273 						highlights.put(span, "target");
274 					}
275 					else {
276 						if (sentenceSpan.overlaps(span)) {
277 							LOGGER.warn("Target span " + span
278 									+ " only overlapping with sentence span " + sentenceSpan
279 									+ " in " + documentURI);
280 						}
281 						otherTargets.add(span.apply(text));
282 					}
283 				}
284 			}
285 
286 			// Extract dsa attributes
287 			final String attitudeID = thisRecord.getValue("id");
288 			if (attitudeID != null) {
289 				final Record dsaRecord = annotations.getRecord("GATE_direct-subjective", "attitude-link", attitudeID);
290 				if (dsaRecord != null) {
291 					for (final String name : DSA_FIELDS) {
292 						String value = dsaRecord.getValue(name);
293 						if (value != null) {
294 							fields.put("dsa-" + name, value);
295 						}
296 					}
297 
298 					final String nestedSource = dsaRecord.getValue("nested-source");
299 					addSources(nestedSource, agentRecords, lastRecords, sentenceSpan, fields, documentURI, text);
300 				}
301 			}
302 
303 			// Emit TSV record
304 			tsvWriter.append(getTsvString(documentURI, fields));
305 
306 			// Print debug
307 			LOGGER.debug(fields.get("type").toString());
308 			LOGGER.debug(fields.toString());
309 			for (String expression : fields.get("expression")) {
310 				LOGGER.debug(expression);
311 				Span span = new Span(expression);
312 				LOGGER.debug(span.apply(text));
313 			}
314 			System.out.println();
315 
316 			// Emit HTML sentence
317 			htmlWriter.append("<p>");
318 			htmlWriter.append("<span class=\"counter\" title=\"");
319 			htmlWriter.append("document: ").append(HtmlEscapers.htmlEscaper().escape(documentURI))
320 					.append(NEWLINE);
321 			if (!otherSources.isEmpty()) {
322 				htmlWriter.append("other sources: ").append(Joiner.on(" | ").join(otherSources))
323 						.append(NEWLINE);
324 			}
325 			if (!otherTargets.isEmpty()) {
326 				htmlWriter.append("other targets: ").append(Joiner.on(" | ").join(otherTargets))
327 						.append(NEWLINE);
328 			}
329 			for (final String name : TSV_FIELDS) {
330 				final List<String> values = Lists.newArrayList();
331 				for (final String value : fields.get(name)) {
332 					if (value != null) {
333 						values.add(value);
334 					}
335 				}
336 				if (!values.isEmpty()) {
337 					htmlWriter.append(name).append(": ").append(Joiner.on(" | ").join(values))
338 							.append(NEWLINE);
339 				}
340 			}
341 			htmlWriter.append("\">" + counter.incrementAndGet() + "</span> ");
342 			final List<Span> spans = sentenceSpan.split(highlights.keySet());
343 			for (final Span span : spans) {
344 				final Set<String> cssClasses = Sets.newHashSet();
345 				for (final Map.Entry<Span, String> entry : highlights.entries()) {
346 					if (entry.getKey().contains(span)) {
347 						cssClasses.add(entry.getValue());
348 					}
349 				}
350 				if (!cssClasses.isEmpty()) {
351 					htmlWriter.append("<span class=\"").append(Joiner.on(" ").join(cssClasses))
352 							.append("\">");
353 				}
354 				htmlWriter.append(span.apply(text));
355 				if (!cssClasses.isEmpty()) {
356 					htmlWriter.append("</span>");
357 				}
358 			}
359 			htmlWriter.append("</p>\n\n");
360 		}
361 
362 		// Objective
363 		for (final Record thisRecord : annotations.getRecords("GATE_objective-speech-event")) {
364 
365 			final Multimap<String, String> fields = HashMultimap.create();
366 			fields.put("type", "objective");
367 			String id = thisRecord.getValue("id");
368 			if (id != null) {
369 				fields.put("id", id);
370 			}
371 
372 			final Span expressionSpan = thisRecord.getSpan().align(text);
373 			fields.put("expression", expressionSpan.toString());
374 
375 			// Extract sentence
376 			Span sentenceSpan = getSentenceSpan(thisRecord, sentences, fields, text, documentURI);
377 			if (sentenceSpan == null) {
378 				continue;
379 			}
380 
381 			// Holder
382 			String sources = thisRecord.getValue("nested-source");
383 			addSources(sources, agentRecords, lastRecords, sentenceSpan, fields, documentURI, text);
384 
385 			// Emit TSV string
386 			tsvWriter.append(getTsvString(documentURI, fields));
387 
388 			// Print debug
389 			LOGGER.debug(fields.get("type").toString());
390 			LOGGER.debug(fields.toString());
391 			for (String expression : fields.get("expression")) {
392 				LOGGER.debug(expression);
393 				Span span = new Span(expression);
394 				LOGGER.debug(span.apply(text));
395 			}
396 			System.out.println();
397 		}
398 
399 		// Expressive
400 		for (final Record thisRecord : annotations.getRecords("GATE_expressive-subjectivity")) {
401 			final Multimap<String, String> fields = HashMultimap.create();
402 			fields.put("type", "expressive");
403 			String id = thisRecord.getValue("id");
404 			if (id != null) {
405 				fields.put("id", id);
406 			}
407 
408 			final Span expressionSpan = thisRecord.getSpan().align(text);
409 			fields.put("expression", expressionSpan.toString());
410 
411 			// Sentence
412 			Span sentenceSpan = getSentenceSpan(thisRecord, sentences, fields, text, documentURI);
413 			if (sentenceSpan == null) {
414 				continue;
415 			}
416 
417 			for (final String name : DSA_FIELDS) {
418 				String value = thisRecord.getValue(name);
419 				if (value != null) {
420 					fields.put("dsa-" + name, value);
421 				}
422 			}
423 
424 			// Holder
425 			String sources = thisRecord.getValue("nested-source");
426 			addSources(sources, agentRecords, lastRecords, sentenceSpan, fields, documentURI, text);
427 
428 			// Emit TSV string
429 			tsvWriter.append(getTsvString(documentURI, fields));
430 
431 			// Print debug
432 			LOGGER.debug(fields.get("type").toString());
433 			LOGGER.debug(fields.toString());
434 			for (String expression : fields.get("expression")) {
435 				LOGGER.debug(expression);
436 				Span span = new Span(expression);
437 				LOGGER.debug(span.apply(text));
438 			}
439 			System.out.println();
440 		}
441 
442 		// Subjective
443 		for (final Record thisRecord : annotations.getRecords("GATE_direct-subjective")) {
444 
445 			final Multimap<String, String> fields = HashMultimap.create();
446 			fields.put("type", "subjective");
447 			String id = thisRecord.getValue("id");
448 			if (id != null) {
449 				fields.put("id", id);
450 			}
451 
452 			final Span expressionSpan = thisRecord.getSpan().align(text);
453 			fields.put("expression", expressionSpan.toString());
454 
455 			// Sentence
456 			Span sentenceSpan = getSentenceSpan(thisRecord, sentences, fields, text, documentURI);
457 			if (sentenceSpan == null) {
458 				continue;
459 			}
460 
461 			for (final String name : DSA_FIELDS) {
462 				String value = thisRecord.getValue(name);
463 				if (value != null) {
464 					fields.put("dsa-" + name, value);
465 				}
466 			}
467 
468 			// Holder
469 			String sources = thisRecord.getValue("nested-source");
470 			addSources(sources, agentRecords, lastRecords, sentenceSpan, fields, documentURI, text);
471 
472 			// Emit TSV string
473 			tsvWriter.append(getTsvString(documentURI, fields));
474 
475 			// Print debug
476 			LOGGER.debug(fields.get("type").toString());
477 			LOGGER.debug(fields.toString());
478 			for (String expression : fields.get("expression")) {
479 				LOGGER.debug(expression);
480 				Span span = new Span(expression);
481 				LOGGER.debug(span.apply(text));
482 			}
483 			System.out.println();
484 		}
485 	}
486 
487 	private static Span getSentenceSpan(Record record, RecordSet sentences, Multimap<String, String> fields, String text, String documentURI) {
488 
489 		Span ret = null;
490 		Span okSpan = record.getSpan().align(text);
491 
492 		if (okSpan.end == 0 || okSpan.begin == okSpan.end) {
493 			return ret;
494 		}
495 
496 		for (final Record sentenceRecord : sentences.getRecords()) {
497 			final Span span = sentenceRecord.getSpan();
498 			if (span.contains(okSpan)) {
499 				ret = span;
500 				fields.put("sentence", okSpan.toString());
501 				break;
502 			}
503 		}
504 
505 		if (ret == null) {
506 			LOGGER.warn("Could not locate sentence for span {} in {}", okSpan.toString(), documentURI);
507 		}
508 
509 		return ret;
510 	}
511 
512 	private static void addSources(String nestedSource, HashMap<String, Record> agentRecords,
513 								   HashMultimap<String, Record> lastRecords, Span sentenceSpan,
514 								   Multimap<String, String> fields, String documentURI, String text) {
515 		addSources(nestedSource, agentRecords, lastRecords, sentenceSpan, fields, documentURI, text, null, null);
516 	}
517 
518 	private static void addSources(String nestedSource, HashMap<String, Record> agentRecords,
519 								   HashMultimap<String, Record> lastRecords, Span sentenceSpan,
520 								   Multimap<String, String> fields, String documentURI, String text,
521 								   @Nullable Multimap<Span, String> highlights,
522 								   @Nullable Set<String> otherSources) {
523 		if (nestedSource != null) {
524 			List<String> sources = parseList(nestedSource);
525 
526 			if (sources.size() > 0) {
527 				String last = sources.get(sources.size() - 1);
528 
529 				addSourceFromRecord(agentRecords.get(last), sentenceSpan, fields, documentURI, text, highlights, otherSources);
530 				for (Record record : lastRecords.get(last)) {
531 					addSourceFromRecord(record, sentenceSpan, fields, documentURI, text, highlights, otherSources);
532 				}
533 			}
534 		}
535 
536 	}
537 
538 	private static void addSourceFromRecord(Record record, Span sentenceSpan, Multimap<String, String> fields,
539 											String documentURI, String text) {
540 		addSourceFromRecord(record, sentenceSpan, fields, documentURI, text, null, null);
541 	}
542 
543 	private static void addSourceFromRecord(Record record, Span sentenceSpan, Multimap<String, String> fields,
544 											String documentURI, String text,
545 											@Nullable Multimap<Span, String> highlights,
546 											@Nullable Set<String> otherSources) {
547 		if (record == null) {
548 			return;
549 		}
550 
551 		final Span span = record.getSpan().align(text);
552 		if (span.end == 0) {
553 			return;
554 		}
555 
556 		span.check(text, documentURI);
557 		fields.put("source", span.toString());
558 		if (sentenceSpan.contains(span)) {
559 			fields.put("source-local", span.toString());
560 			if (highlights != null) {
561 				highlights.put(span, "source");
562 			}
563 		}
564 		else {
565 			if (sentenceSpan.overlaps(span)) {
566 				LOGGER.warn("Source span " + span
567 						+ " only overlapping with sentence span "
568 						+ sentenceSpan + " in " + documentURI);
569 			}
570 			if (otherSources != null) {
571 				otherSources.add(span.apply(text));
572 			}
573 		}
574 
575 	}
576 
577 	private static List<String> parseList(String sourceString) {
578 		List<String> ret = new ArrayList<>();
579 
580 		String[] parts = sourceString.split(",");
581 		for (String part : parts) {
582 			part = part.trim();
583 			if (part.length() > 0) {
584 				ret.add(part);
585 			}
586 		}
587 
588 		return ret;
589 	}
590 
591 	private static CharSequence getTsvString(String documentURI, Multimap<String, String> fields) {
592 		StringBuilder ret = new StringBuilder();
593 
594 		ret.append("document=").append(documentURI);
595 		for (final String name : TSV_FIELDS) {
596 			final List<String> values = Lists.newArrayList();
597 			for (final String value : fields.get(name)) {
598 				if (value != null) {
599 					values.add(value);
600 				}
601 			}
602 			if (!values.isEmpty()) {
603 				ret.append("\t").append(name).append("=")
604 						.append(Joiner.on("|").join(values).replace('\t', ' '));
605 			}
606 		}
607 		ret.append("\n");
608 
609 		return ret.toString();
610 	}
611 
612 	private static String readText(@Nullable final File file) throws IOException {
613 		if (file == null || !file.exists()) {
614 			return "";
615 		}
616 		try (Reader reader = RecordSet.read(file)) {
617 			return CharStreams.toString(reader);
618 		}
619 	}
620 
621 	private static Writer write(final File file) throws IOException {
622 		Files.createParentDirs(file);
623 		return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file),
624 				Charsets.UTF_8));
625 	}
626 
627 	private static File resolve(@Nullable final File base, final String name) {
628 		final File actualBase = base != null ? base : new File(System.getProperty("user.dir"));
629 		return actualBase.toPath().resolve(Paths.get(name)).toFile();
630 	}
631 
	/**
	 * Reshapes the text of a document so that it can be processed together with the
	 * annotation offsets: applies document-specific offset corrections, blanks markup and
	 * line breaks, erases the text between consecutive sentences and places a newline at
	 * each sentence boundary. Character replacements are done in place, so the length of the
	 * text is only changed by the document-specific corrections.
	 *
	 * @param documentURI the URI of the document, used to select corrections and in warnings
	 * @param text        the original text of the document
	 * @param sentences   the sentence records of the document
	 * @return the reshaped text
	 */
	private static String fixText(final String documentURI, String text, final RecordSet sentences) {

		// For some documents (five are handled below), offsets in the MPQA annotation files
		// are not aligned with the text. The following code fixes the issue by reshaping the
		// text (inserting or dropping a few characters at fixed offsets) so that it becomes
		// properly aligned with the offsets
		if (documentURI.endsWith("xbank_wsj_0583")) {
			text = text.substring(0, 2263) + "   " + text.substring(2263);
		}
		else if (documentURI.endsWith("ula_IZ-060316-01-Trans-1")) {
			text = text.substring(0, 10174) + text.substring(10176);
		}
		else if (documentURI.endsWith("ula_AFGP-2002-600175-Trans")) {
			text = text.substring(0, 7903) + " " + text.substring(7906);
		}
		else if (documentURI.endsWith("ula_chapter-10")) {
			text = text.substring(0, 46929) + text.substring(46932);
		}
		else if (documentURI.endsWith("ula_AFGP-2002-600002-Trans")) {
			text = text.substring(0, 9902) + text.substring(9905, 9938) + text.substring(9941);
		}

		// Collect the sentence spans, sorted according to their natural ordering
		final List<Span> sentenceSpans = Lists.newArrayList();
		for (final Record sentenceRecord : sentences.getRecords()) {
			sentenceSpans.add(sentenceRecord.getSpan());
		}
		Collections.sort(sentenceSpans);

		// Blank out <tag> markup (not much) and all newlines, carriage returns and tabs.
		// Characters are overwritten with spaces rather than deleted, so offsets are kept
		final StringBuilder builder = new StringBuilder(text);
		boolean insideTag = false;
		for (int i = 0; i < builder.length(); ++i) {
			final char c = builder.charAt(i);
			if (c == '<') {
				insideTag = true;
				builder.setCharAt(i, ' ');
			}
			else if (c == '>') {
				insideTag = false;
				builder.setCharAt(i, ' ');
			}
			else if (insideTag || c == '\n' || c == '\r' || c == '\t') {
				builder.setCharAt(i, ' ');
			}
		}

		// Process each pair of consecutive sentences
		// NOTE(review): offsets are used without bounds checks; a sentence offset outside
		// the text (e.g., with an empty text) would raise an index exception - confirm that
		// inputs always satisfy this
		for (int i = 0; i < sentenceSpans.size() - 1; ++i) {
			final Span first = sentenceSpans.get(i);
			final Span next = sentenceSpans.get(i + 1);

			// Check sentence boundary, logging a warning if it seems wrong, i.e., if all the
			// characters from the last one of the first sentence to the first one of the
			// next sentence (both included) are letters or digits
			if (next.begin >= first.end) {
				boolean allAlpha = true;
				for (int j = first.end - 1; j <= next.begin; ++j) {
					allAlpha = allAlpha && Character.isLetterOrDigit(builder.charAt(j));
				}
				if (allAlpha) {
					LOGGER.warn("Boundary between " + first + " and " + next
							+ " could be wrong in " + documentURI + " ("
							+ text.substring(first.end - 1, next.begin + 1) + ")");
				}
			}

			// Erase text between annotated sentences
			for (int j = first.end; j < next.begin; ++j) {
				builder.setCharAt(j, ' ');
			}

			// Add newlines between sentences so to pass gold sentence splitting to Stanford.
			// With a gap between the sentences, the newline replaces the last character of
			// the gap; otherwise it replaces the first character of the next sentence if
			// that is a delimiter, or else the character right before it
			if (next.begin > first.end) {
				builder.setCharAt(next.begin - 1, '\n');
			}
			else {
				builder.setCharAt(isDelim(builder.charAt(next.begin)) ? next.begin
						: next.begin - 1, '\n');
			}
		}

		return builder.toString();
	}
712 
713 	public static boolean isWord(final char c) {
714 		return " \t\n\r,;:!?".indexOf(c) < 0;
715 	}
716 
717 	public static boolean isDelim(final char c) {
718 		return " \t\n\r,;:!?.()[]<>~`'\"-".indexOf(c) >= 0;
719 	}
720 
721 }