1 package eu.fbk.dkm.pikes.resources.mpqa;
2
3 import com.google.common.base.Charsets;
4 import com.google.common.base.Joiner;
5 import com.google.common.collect.HashMultimap;
6 import com.google.common.collect.Lists;
7 import com.google.common.collect.Multimap;
8 import com.google.common.collect.Sets;
9 import com.google.common.html.HtmlEscapers;
10 import com.google.common.io.CharStreams;
11 import com.google.common.io.Files;
12 import eu.fbk.rdfpro.util.Statements;
13 import eu.fbk.utils.core.CommandLine;
14 import eu.fbk.rdfpro.util.IO;
15 import ixa.kaflib.KAFDocument;
16 import org.slf4j.Logger;
17 import org.slf4j.LoggerFactory;
18
19 import javax.annotation.Nullable;
20 import javax.xml.stream.XMLStreamException;
21 import java.io.*;
22 import java.nio.file.Paths;
23 import java.util.*;
24 import java.util.concurrent.atomic.AtomicInteger;
25
26 public class CorpusPreprocessor {
27
28 private static final Logger LOGGER = LoggerFactory.getLogger(CorpusPreprocessor.class);
29
30 private static final String NEWLINE = " ";
31 private static final String DEFAULT_DOCS_LIST = "doclist.all";
32 private static final String DEFAULT_NAF_DIR = "NAF";
33 public static final String DEFAULT_NAMESPACE = "http://eu.fbk.dkm.pikes.resources.mpqa.cs.pitt.edu/corpora/mpqa_corpus/";
34 public static final String DEFAULT_ANNOTATION_TSV = "annotations.tsv";
35 public static final String DEFAULT_ANNOTATION_HTML = "annotations.html";
36
37 private static final String[] DSA_FIELDS = new String[]{"implicit", "insubstantial",
38 "polarity", "intensity", "expression-intensity", "annotation-uncertain",
39 "subjective-uncertain"};
40
41 private static final String[] TSV_FIELDS = new String[]{"sentiment", "intensity",
42 "attitude", "target", "source", "source-local", "sentence", "dsa-implicit",
43 "dsa-insubstantial", "dsa-polarity", "dsa-intensity", "dsa-expression-intensity",
44 "dsa-annotation-uncertain", "dsa-subjective-uncertain", "type", "id", "expression"};
45
46 private static final String[] MULTI_FIELDS = new String[]{"nested-source", "attitude-link"};
47
48 public static void main(final String[] args) throws IOException, XMLStreamException {
49 try {
50 final CommandLine cmd = CommandLine
51 .parser()
52 .withName("corpus-preprocessor")
53 .withHeader(
54 "Produces NAF files, a TSV file with sentiment annotations "
55 + "and an HTML file with annotated sentences "
56 + "starting from the MPQA v.2 corpus")
57 .withOption("i", "input-path", "the base path of the MPQA corpus", "DIR",
58 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
59 .withOption("f", "filelist",
60 String.format("the file with the docs filenames (relative to input path), default [basedir]/%s", DEFAULT_DOCS_LIST), "FILE",
61 CommandLine.Type.FILE_EXISTING, true, false, false)
62 .withOption("o", "output",
63 String.format("the output path where to save produced files, default [basedir]/%s", DEFAULT_NAF_DIR),
64 "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
65 .withOption("n", "namespace",
66 String.format("the namespace for generating document URIs, default %s", DEFAULT_NAMESPACE),
67 "NS", CommandLine.Type.STRING, true, false, false)
68 .withOption("doc", "doc", "Check only one document", "URL", CommandLine.Type.STRING, true, false, false)
69 .withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
70
71 final File inputPath = cmd.getOptionValue("i", File.class);
72
73 File outputPath = new File(inputPath.getAbsolutePath() + File.separator + DEFAULT_NAF_DIR);
74 if (cmd.hasOption("o")) {
75 outputPath = cmd.getOptionValue("o", File.class);
76 }
77 if (!outputPath.exists()) {
78 outputPath.mkdirs();
79 }
80
81 File filelist = new File(inputPath.getAbsolutePath() + File.separator + DEFAULT_DOCS_LIST);
82 if (cmd.hasOption("f")) {
83 filelist = cmd.getOptionValue("f", File.class);
84 }
85
86 String namespace = DEFAULT_NAMESPACE;
87 if (cmd.hasOption("n")) {
88 namespace = cmd.getOptionValue("n", String.class);
89 }
90
91 String checkOneDoc = cmd.getOptionValue("doc", String.class);
92
93 preprocess(inputPath, outputPath, filelist, namespace, checkOneDoc);
94
95 } catch (final Throwable ex) {
96 CommandLine.fail(ex);
97 }
98 }
99
100 public static final void preprocess(@Nullable final File inputPath,
101 @Nullable final File outputPath, final File fileList, final String namespace,
102 @Nullable final String checkOneDoc)
103 throws IOException {
104
105 final List<String> filenames = Files.readLines(fileList, Charsets.UTF_8);
106
107 Writer tsvWriter = null;
108 Writer htmlWriter = null;
109
110 try {
111 tsvWriter = write(resolve(inputPath, DEFAULT_ANNOTATION_TSV));
112 htmlWriter = write(resolve(inputPath, DEFAULT_ANNOTATION_HTML));
113
114 htmlWriter.write("<html>\n<head>\n<style type=\"text/css\">\n");
115 htmlWriter.write(".counter { background-color: black; color: white; "
116 + "font-size: 80%; font-weight: bold; padding-left: 10px; "
117 + "padding-right: 10px; margin-right: 10px;}\n");
118 htmlWriter.write(".pos { background-color: #95FF4F }\n");
119 htmlWriter.write(".neg { background-color: #FF9797 }\n");
120 htmlWriter.write(".source { color: black; font-weight: bold }\n");
121 htmlWriter.write(".target { color: blue; font-weight: bold }\n");
122 htmlWriter.write("</style>\n</head>\n<body>\n");
123
124 int fileCounter = 0;
125 final AtomicInteger sentenceCounter = new AtomicInteger(0);
126 for (final String filename : filenames) {
127 LOGGER.info("Processing document {}/{}: {}", ++fileCounter, filenames.size(),
128 filename);
129
130 final String name = filename.replace('/', '_');
131 final String documentURI = namespace + name;
132
133 final RecordSet metadata = RecordSet.readFromFile(resolve(inputPath, "meta_anns/"
134 + filename));
135 final RecordSet annotations = RecordSet.readFromFile(resolve(inputPath,
136 "man_anns/" + filename + "/gateman.eu.fbk.dkm.pikes.resources.mpqa.lre.2.0"));
137 final RecordSet sentences = RecordSet.readFromFile(resolve(inputPath, "man_anns/"
138 + filename + "/gatesentences.eu.fbk.dkm.pikes.resources.mpqa.2.0"));
139 final String text = fixText(documentURI,
140 readText(resolve(inputPath, "docs/" + filename)), sentences);
141 final File nafFile = resolve(outputPath, name + ".naf");
142
143 if (checkOneDoc != null && !checkOneDoc.equals(documentURI)) {
144 continue;
145 }
146
147 if (!text.isEmpty() && !annotations.getRecords().isEmpty()) {
148 emitNAF(documentURI, text, metadata, nafFile);
149 emitAnnotations(documentURI, text, annotations, sentences, tsvWriter,
150 htmlWriter, sentenceCounter);
151 }
152 }
153
154 htmlWriter.write("</body>\n</html>");
155
156 } finally {
157 IO.closeQuietly(tsvWriter);
158 IO.closeQuietly(htmlWriter);
159 }
160 }
161
162 private static void emitNAF(final String documentURI, final String text,
163 final RecordSet metadata, final File nafFile) {
164
165 final String source = metadata.getRecordValue("meta_source", "-");
166 final String description = metadata.getRecordValue("meta_description", "-");
167 final String createTime = metadata.getRecordValue("meta_create_time", null);
168 final String mediaFile = metadata.getRecordValue("meta_media_file", null);
169 final String mediaType = metadata.getRecordValue("meta_media_type", null);
170 final String title = metadata.getRecordValue("meta_title", "-");
171 final String country = metadata.getRecordValue("meta_country", "-");
172 final String topic = metadata.getRecordValue("meta_topic", "-").toLowerCase();
173
174 final KAFDocument document = new KAFDocument("en", "v3");
175
176 final StringBuilder builder = new StringBuilder();
177 int index = 0;
178 for (; index < text.length(); ++index) {
179 if (Character.isWhitespace(text.charAt(index))) {
180 builder.append(" ");
181 }
182 else {
183 break;
184 }
185 }
186 builder.append(text.substring(index));
187
188 document.setRawText(builder.toString());
189
190 document.createPublic();
191 document.getPublic().publicId = Statements.VALUE_FACTORY.createIRI(documentURI).getLocalName();
192 document.getPublic().uri = documentURI;
193
194 document.createFileDesc();
195 document.getFileDesc().author = source + " / " + description;
196 document.getFileDesc().creationtime = createTime;
197 document.getFileDesc().filename = mediaFile;
198 document.getFileDesc().filetype = mediaType;
199 document.getFileDesc().title = title + " (" + topic + " / " + country + ")";
200
201 document.save(nafFile.getAbsolutePath());
202 }
203
204 private static void emitAnnotations(final String documentURI, final String text,
205 final RecordSet annotations, final RecordSet sentences, final Writer tsvWriter,
206 final Writer htmlWriter, final AtomicInteger counter) throws IOException {
207
208
209 HashMap<String, Record> agentRecords = new HashMap<>();
210 HashMultimap<String, Record> lastRecords = HashMultimap.create();
211
212 for (final Record agentRecord : annotations.getRecords("GATE_agent")) {
213 String sourceString = agentRecord.getValue("nested-source");
214 if (sourceString != null) {
215 List<String> sources = parseList(sourceString);
216 if (sources.size() > 0) {
217 String last = sources.get(sources.size() - 1);
218 lastRecords.put(last, agentRecord);
219 }
220 }
221
222 String id = agentRecord.getValue("id");
223 if (id == null) {
224 continue;
225 }
226 agentRecords.put(id, agentRecord);
227 }
228
229
230 for (final Record thisRecord : annotations.getRecords("GATE_attitude")) {
231
232 final Multimap<Span, String> highlights = HashMultimap.create();
233 final Multimap<String, String> fields = HashMultimap.create();
234
235 fields.put("type", "attitude");
236 String id = thisRecord.getValue("id");
237 if (id != null) {
238 fields.put("id", id);
239 }
240
241 final Set<String> otherSources = Sets.newHashSet();
242 final Set<String> otherTargets = Sets.newHashSet();
243
244
245 String sentiment = thisRecord.getValue("attitude-type");
246 if (sentiment == null || !sentiment.startsWith("sentiment-")) {
247 continue;
248 }
249 sentiment = sentiment.substring("sentiment-".length());
250 fields.put("sentiment", sentiment);
251
252
253 final Span expressionSpan = thisRecord.getSpan().align(text);
254 fields.put("expression", expressionSpan.toString());
255 fields.put("intensity", thisRecord.getValue("intensity"));
256 highlights.put(expressionSpan, "pos".equals(sentiment) ? "pos" : "neg");
257
258
259 Span sentenceSpan = getSentenceSpan(thisRecord, sentences, fields, text, documentURI);
260 if (sentenceSpan == null) {
261 continue;
262 }
263
264
265 final String targetID = thisRecord.getValue("target-link");
266 if (targetID != null) {
267 final Record targetRecord = annotations.getRecord("GATE_target", "id", targetID);
268 if (targetRecord != null) {
269 final Span span = targetRecord.getSpan().align(text);
270 span.check(text, documentURI);
271 fields.put("target", span.toString());
272 if (sentenceSpan.contains(span)) {
273 highlights.put(span, "target");
274 }
275 else {
276 if (sentenceSpan.overlaps(span)) {
277 LOGGER.warn("Target span " + span
278 + " only overlapping with sentence span " + sentenceSpan
279 + " in " + documentURI);
280 }
281 otherTargets.add(span.apply(text));
282 }
283 }
284 }
285
286
287 final String attitudeID = thisRecord.getValue("id");
288 if (attitudeID != null) {
289 final Record dsaRecord = annotations.getRecord("GATE_direct-subjective", "attitude-link", attitudeID);
290 if (dsaRecord != null) {
291 for (final String name : DSA_FIELDS) {
292 String value = dsaRecord.getValue(name);
293 if (value != null) {
294 fields.put("dsa-" + name, value);
295 }
296 }
297
298 final String nestedSource = dsaRecord.getValue("nested-source");
299 addSources(nestedSource, agentRecords, lastRecords, sentenceSpan, fields, documentURI, text);
300 }
301 }
302
303
304 tsvWriter.append(getTsvString(documentURI, fields));
305
306
307 LOGGER.debug(fields.get("type").toString());
308 LOGGER.debug(fields.toString());
309 for (String expression : fields.get("expression")) {
310 LOGGER.debug(expression);
311 Span span = new Span(expression);
312 LOGGER.debug(span.apply(text));
313 }
314 System.out.println();
315
316
317 htmlWriter.append("<p>");
318 htmlWriter.append("<span class=\"counter\" title=\"");
319 htmlWriter.append("document: ").append(HtmlEscapers.htmlEscaper().escape(documentURI))
320 .append(NEWLINE);
321 if (!otherSources.isEmpty()) {
322 htmlWriter.append("other sources: ").append(Joiner.on(" | ").join(otherSources))
323 .append(NEWLINE);
324 }
325 if (!otherTargets.isEmpty()) {
326 htmlWriter.append("other targets: ").append(Joiner.on(" | ").join(otherTargets))
327 .append(NEWLINE);
328 }
329 for (final String name : TSV_FIELDS) {
330 final List<String> values = Lists.newArrayList();
331 for (final String value : fields.get(name)) {
332 if (value != null) {
333 values.add(value);
334 }
335 }
336 if (!values.isEmpty()) {
337 htmlWriter.append(name).append(": ").append(Joiner.on(" | ").join(values))
338 .append(NEWLINE);
339 }
340 }
341 htmlWriter.append("\">" + counter.incrementAndGet() + "</span> ");
342 final List<Span> spans = sentenceSpan.split(highlights.keySet());
343 for (final Span span : spans) {
344 final Set<String> cssClasses = Sets.newHashSet();
345 for (final Map.Entry<Span, String> entry : highlights.entries()) {
346 if (entry.getKey().contains(span)) {
347 cssClasses.add(entry.getValue());
348 }
349 }
350 if (!cssClasses.isEmpty()) {
351 htmlWriter.append("<span class=\"").append(Joiner.on(" ").join(cssClasses))
352 .append("\">");
353 }
354 htmlWriter.append(span.apply(text));
355 if (!cssClasses.isEmpty()) {
356 htmlWriter.append("</span>");
357 }
358 }
359 htmlWriter.append("</p>\n\n");
360 }
361
362
363 for (final Record thisRecord : annotations.getRecords("GATE_objective-speech-event")) {
364
365 final Multimap<String, String> fields = HashMultimap.create();
366 fields.put("type", "objective");
367 String id = thisRecord.getValue("id");
368 if (id != null) {
369 fields.put("id", id);
370 }
371
372 final Span expressionSpan = thisRecord.getSpan().align(text);
373 fields.put("expression", expressionSpan.toString());
374
375
376 Span sentenceSpan = getSentenceSpan(thisRecord, sentences, fields, text, documentURI);
377 if (sentenceSpan == null) {
378 continue;
379 }
380
381
382 String sources = thisRecord.getValue("nested-source");
383 addSources(sources, agentRecords, lastRecords, sentenceSpan, fields, documentURI, text);
384
385
386 tsvWriter.append(getTsvString(documentURI, fields));
387
388
389 LOGGER.debug(fields.get("type").toString());
390 LOGGER.debug(fields.toString());
391 for (String expression : fields.get("expression")) {
392 LOGGER.debug(expression);
393 Span span = new Span(expression);
394 LOGGER.debug(span.apply(text));
395 }
396 System.out.println();
397 }
398
399
400 for (final Record thisRecord : annotations.getRecords("GATE_expressive-subjectivity")) {
401 final Multimap<String, String> fields = HashMultimap.create();
402 fields.put("type", "expressive");
403 String id = thisRecord.getValue("id");
404 if (id != null) {
405 fields.put("id", id);
406 }
407
408 final Span expressionSpan = thisRecord.getSpan().align(text);
409 fields.put("expression", expressionSpan.toString());
410
411
412 Span sentenceSpan = getSentenceSpan(thisRecord, sentences, fields, text, documentURI);
413 if (sentenceSpan == null) {
414 continue;
415 }
416
417 for (final String name : DSA_FIELDS) {
418 String value = thisRecord.getValue(name);
419 if (value != null) {
420 fields.put("dsa-" + name, value);
421 }
422 }
423
424
425 String sources = thisRecord.getValue("nested-source");
426 addSources(sources, agentRecords, lastRecords, sentenceSpan, fields, documentURI, text);
427
428
429 tsvWriter.append(getTsvString(documentURI, fields));
430
431
432 LOGGER.debug(fields.get("type").toString());
433 LOGGER.debug(fields.toString());
434 for (String expression : fields.get("expression")) {
435 LOGGER.debug(expression);
436 Span span = new Span(expression);
437 LOGGER.debug(span.apply(text));
438 }
439 System.out.println();
440 }
441
442
443 for (final Record thisRecord : annotations.getRecords("GATE_direct-subjective")) {
444
445 final Multimap<String, String> fields = HashMultimap.create();
446 fields.put("type", "subjective");
447 String id = thisRecord.getValue("id");
448 if (id != null) {
449 fields.put("id", id);
450 }
451
452 final Span expressionSpan = thisRecord.getSpan().align(text);
453 fields.put("expression", expressionSpan.toString());
454
455
456 Span sentenceSpan = getSentenceSpan(thisRecord, sentences, fields, text, documentURI);
457 if (sentenceSpan == null) {
458 continue;
459 }
460
461 for (final String name : DSA_FIELDS) {
462 String value = thisRecord.getValue(name);
463 if (value != null) {
464 fields.put("dsa-" + name, value);
465 }
466 }
467
468
469 String sources = thisRecord.getValue("nested-source");
470 addSources(sources, agentRecords, lastRecords, sentenceSpan, fields, documentURI, text);
471
472
473 tsvWriter.append(getTsvString(documentURI, fields));
474
475
476 LOGGER.debug(fields.get("type").toString());
477 LOGGER.debug(fields.toString());
478 for (String expression : fields.get("expression")) {
479 LOGGER.debug(expression);
480 Span span = new Span(expression);
481 LOGGER.debug(span.apply(text));
482 }
483 System.out.println();
484 }
485 }
486
487 private static Span getSentenceSpan(Record record, RecordSet sentences, Multimap<String, String> fields, String text, String documentURI) {
488
489 Span ret = null;
490 Span okSpan = record.getSpan().align(text);
491
492 if (okSpan.end == 0 || okSpan.begin == okSpan.end) {
493 return ret;
494 }
495
496 for (final Record sentenceRecord : sentences.getRecords()) {
497 final Span span = sentenceRecord.getSpan();
498 if (span.contains(okSpan)) {
499 ret = span;
500 fields.put("sentence", okSpan.toString());
501 break;
502 }
503 }
504
505 if (ret == null) {
506 LOGGER.warn("Could not locate sentence for span {} in {}", okSpan.toString(), documentURI);
507 }
508
509 return ret;
510 }
511
512 private static void addSources(String nestedSource, HashMap<String, Record> agentRecords,
513 HashMultimap<String, Record> lastRecords, Span sentenceSpan,
514 Multimap<String, String> fields, String documentURI, String text) {
515 addSources(nestedSource, agentRecords, lastRecords, sentenceSpan, fields, documentURI, text, null, null);
516 }
517
518 private static void addSources(String nestedSource, HashMap<String, Record> agentRecords,
519 HashMultimap<String, Record> lastRecords, Span sentenceSpan,
520 Multimap<String, String> fields, String documentURI, String text,
521 @Nullable Multimap<Span, String> highlights,
522 @Nullable Set<String> otherSources) {
523 if (nestedSource != null) {
524 List<String> sources = parseList(nestedSource);
525
526 if (sources.size() > 0) {
527 String last = sources.get(sources.size() - 1);
528
529 addSourceFromRecord(agentRecords.get(last), sentenceSpan, fields, documentURI, text, highlights, otherSources);
530 for (Record record : lastRecords.get(last)) {
531 addSourceFromRecord(record, sentenceSpan, fields, documentURI, text, highlights, otherSources);
532 }
533 }
534 }
535
536 }
537
538 private static void addSourceFromRecord(Record record, Span sentenceSpan, Multimap<String, String> fields,
539 String documentURI, String text) {
540 addSourceFromRecord(record, sentenceSpan, fields, documentURI, text, null, null);
541 }
542
543 private static void addSourceFromRecord(Record record, Span sentenceSpan, Multimap<String, String> fields,
544 String documentURI, String text,
545 @Nullable Multimap<Span, String> highlights,
546 @Nullable Set<String> otherSources) {
547 if (record == null) {
548 return;
549 }
550
551 final Span span = record.getSpan().align(text);
552 if (span.end == 0) {
553 return;
554 }
555
556 span.check(text, documentURI);
557 fields.put("source", span.toString());
558 if (sentenceSpan.contains(span)) {
559 fields.put("source-local", span.toString());
560 if (highlights != null) {
561 highlights.put(span, "source");
562 }
563 }
564 else {
565 if (sentenceSpan.overlaps(span)) {
566 LOGGER.warn("Source span " + span
567 + " only overlapping with sentence span "
568 + sentenceSpan + " in " + documentURI);
569 }
570 if (otherSources != null) {
571 otherSources.add(span.apply(text));
572 }
573 }
574
575 }
576
577 private static List<String> parseList(String sourceString) {
578 List<String> ret = new ArrayList<>();
579
580 String[] parts = sourceString.split(",");
581 for (String part : parts) {
582 part = part.trim();
583 if (part.length() > 0) {
584 ret.add(part);
585 }
586 }
587
588 return ret;
589 }
590
591 private static CharSequence getTsvString(String documentURI, Multimap<String, String> fields) {
592 StringBuilder ret = new StringBuilder();
593
594 ret.append("document=").append(documentURI);
595 for (final String name : TSV_FIELDS) {
596 final List<String> values = Lists.newArrayList();
597 for (final String value : fields.get(name)) {
598 if (value != null) {
599 values.add(value);
600 }
601 }
602 if (!values.isEmpty()) {
603 ret.append("\t").append(name).append("=")
604 .append(Joiner.on("|").join(values).replace('\t', ' '));
605 }
606 }
607 ret.append("\n");
608
609 return ret.toString();
610 }
611
612 private static String readText(@Nullable final File file) throws IOException {
613 if (file == null || !file.exists()) {
614 return "";
615 }
616 try (Reader reader = RecordSet.read(file)) {
617 return CharStreams.toString(reader);
618 }
619 }
620
621 private static Writer write(final File file) throws IOException {
622 Files.createParentDirs(file);
623 return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file),
624 Charsets.UTF_8));
625 }
626
627 private static File resolve(@Nullable final File base, final String name) {
628 final File actualBase = base != null ? base : new File(System.getProperty("user.dir"));
629 return actualBase.toPath().resolve(Paths.get(name)).toFile();
630 }
631
632 private static String fixText(final String documentURI, String text, final RecordSet sentences) {
633
634
635
636
637 if (documentURI.endsWith("xbank_wsj_0583")) {
638 text = text.substring(0, 2263) + " " + text.substring(2263);
639 }
640 else if (documentURI.endsWith("ula_IZ-060316-01-Trans-1")) {
641 text = text.substring(0, 10174) + text.substring(10176);
642 }
643 else if (documentURI.endsWith("ula_AFGP-2002-600175-Trans")) {
644 text = text.substring(0, 7903) + " " + text.substring(7906);
645 }
646 else if (documentURI.endsWith("ula_chapter-10")) {
647 text = text.substring(0, 46929) + text.substring(46932);
648 }
649 else if (documentURI.endsWith("ula_AFGP-2002-600002-Trans")) {
650 text = text.substring(0, 9902) + text.substring(9905, 9938) + text.substring(9941);
651 }
652
653 final List<Span> sentenceSpans = Lists.newArrayList();
654 for (final Record sentenceRecord : sentences.getRecords()) {
655 sentenceSpans.add(sentenceRecord.getSpan());
656 }
657 Collections.sort(sentenceSpans);
658
659
660 final StringBuilder builder = new StringBuilder(text);
661 boolean insideTag = false;
662 for (int i = 0; i < builder.length(); ++i) {
663 final char c = builder.charAt(i);
664 if (c == '<') {
665 insideTag = true;
666 builder.setCharAt(i, ' ');
667 }
668 else if (c == '>') {
669 insideTag = false;
670 builder.setCharAt(i, ' ');
671 }
672 else if (insideTag || c == '\n' || c == '\r' || c == '\t') {
673 builder.setCharAt(i, ' ');
674 }
675 }
676
677 for (int i = 0; i < sentenceSpans.size() - 1; ++i) {
678 final Span first = sentenceSpans.get(i);
679 final Span next = sentenceSpans.get(i + 1);
680
681
682 if (next.begin >= first.end) {
683 boolean allAlpha = true;
684 for (int j = first.end - 1; j <= next.begin; ++j) {
685 allAlpha = allAlpha && Character.isLetterOrDigit(builder.charAt(j));
686 }
687 if (allAlpha) {
688 LOGGER.warn("Boundary between " + first + " and " + next
689 + " could be wrong in " + documentURI + " ("
690 + text.substring(first.end - 1, next.begin + 1) + ")");
691 }
692 }
693
694
695 for (int j = first.end; j < next.begin; ++j) {
696 builder.setCharAt(j, ' ');
697 }
698
699
700 if (next.begin > first.end) {
701 builder.setCharAt(next.begin - 1, '\n');
702 }
703 else {
704 builder.setCharAt(isDelim(builder.charAt(next.begin)) ? next.begin
705 : next.begin - 1, '\n');
706 }
707 }
708
709 return builder.toString();
710
711 }
712
713 public static boolean isWord(final char c) {
714 return " \t\n\r,;:!?".indexOf(c) < 0;
715 }
716
717 public static boolean isDelim(final char c) {
718 return " \t\n\r,;:!?.()[]<>~`'\"-".indexOf(c) >= 0;
719 }
720
721 }