package eu.fbk.dkm.pikes.query;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Consumer;

import javax.annotation.Nullable;

import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multiset;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import com.google.common.io.Files;

import eu.fbk.dkm.pikes.rdf.vocab.SUMO;
import net.didion.jwnl.data.PointerType;

import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDFS;
import org.eclipse.rdf4j.model.vocabulary.SESAME;
import org.eclipse.rdf4j.model.vocabulary.SKOS;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import ixa.kaflib.ExternalRef;
import ixa.kaflib.KAFDocument;

import eu.fbk.dkm.pikes.kv.KeyQuadIndex;
import eu.fbk.dkm.pikes.kv.KeyQuadSource;
import eu.fbk.dkm.pikes.query.Term.Layer;
import eu.fbk.dkm.pikes.resources.FrameBase;
import eu.fbk.dkm.pikes.resources.NAFUtils;
import eu.fbk.dkm.pikes.resources.Stemming;
import eu.fbk.dkm.pikes.resources.Sumo;
import eu.fbk.dkm.pikes.resources.WordNet;
import eu.fbk.dkm.pikes.resources.YagoTaxonomy;
import eu.fbk.utils.core.CommandLine;
import eu.fbk.dkm.pikes.rdf.vocab.KS_OLD;
import eu.fbk.rdfpro.AbstractRDFHandlerWrapper;
import eu.fbk.rdfpro.RDFHandlers;
import eu.fbk.rdfpro.RDFProcessors;
import eu.fbk.rdfpro.RDFSources;
import eu.fbk.rdfpro.util.IO;
import eu.fbk.rdfpro.util.QuadModel;
import eu.fbk.rdfpro.util.Statements;

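/**
 * Extracts index terms from pairs of NAF annotation files and RDF knowledge graphs produced by
 * PIKES, emitting one {@link Term} per document / layer / token occurrence.
 * <p>
 * Terms come both from the NAF text layer (stems, lemmas, sub-word stems, WordNet synsets,
 * synonyms and related lemmas) and from the RDF layer (DBpedia URIs, YAGO / SUMO types,
 * FrameBase / PropBank / NomBank predicates and roles, SKOS concepts). An optional
 * {@link KeyQuadSource} index can be supplied to enrich the RDF model with background triples
 * before extraction.
 * <p>
 * Minimal usage sketch (file names are purely illustrative):
 *
 * <pre>
 * TermExtractor extractor = new TermExtractor(null); // no enrichment index
 * List&lt;Term&gt; terms = extractor.extract(new File("doc.naf"), new File("doc.trig"));
 * </pre>
 */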
public class TermExtractor {

    private static final Logger LOGGER = LoggerFactory.getLogger(TermExtractor.class);

    private static final Set<String> LUCENE_STOP_WORDS = ImmutableSet.of("a", "an", "and", "are",
            "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not",
            "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with", "'s");

    private static final String NS_DBPEDIA = "http://dbpedia.org/resource/";

    private static final Map<String, Layer> TYPE_MAP = ImmutableMap.of(YagoTaxonomy.NAMESPACE,
            Layer.TYPE_YAGO, SUMO.NAMESPACE, Layer.TYPE_SUMO, FrameBase.NAMESPACE,
            Layer.PREDICATE_FRB, "http://www.newsreader-project.eu/ontologies/propbank/",
            Layer.PREDICATE_PB, "http://www.newsreader-project.eu/ontologies/nombank/",
            Layer.PREDICATE_NB);

    private static final Map<String, Layer> PROPERTY_MAP = ImmutableMap.of(FrameBase.NAMESPACE,
            Layer.ROLE_FRB, "http://www.newsreader-project.eu/ontologies/propbank/",
            Layer.ROLE_PB, "http://www.newsreader-project.eu/ontologies/nombank/", Layer.ROLE_NB);

    private static final Set<String> RECURSIVE_ENRICHMENT_NAMESPACES = ImmutableSet.of(
            YagoTaxonomy.NAMESPACE, FrameBase.NAMESPACE, SUMO.NAMESPACE);

    private static final Map<String, String> CONCEPT_MAP = ImmutableMap.of(YagoTaxonomy.NAMESPACE,
            "dbyago", FrameBase.NAMESPACE, "frb", NS_DBPEDIA, "dbpedia", "entity:", "entity");

    private final KeyQuadSource enrichmentIndex;

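    /**
     * Command-line entry point. Reads NAF/RDF file pairs, extracts their terms and writes them
     * to a tab-separated output file with columns: document id, layer, token, count and optional
     * {@code key=value} attributes.
     */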
    public static void main(final String[] args) {
        try {

            // Parse command line options
            final CommandLine cmd = CommandLine
                    .parser()
                    .withName("pikes-tex")
                    .withOption("i", "index", "use index at PATH for IRI enrichment", "PATH",
                            CommandLine.Type.FILE, true, false, false)
                    .withOption("r", "recursive", "whether to recurse into input directories")
                    .withOption("o", "output", "output base name", "PATH",
                            CommandLine.Type.STRING, true, false, true)
                    .withHeader("extracts terms from paired NAF and RDF files, "
                            + "writing them to a term file")
                    .parse(args);

            final boolean recursive = cmd.hasOption("r");
            final File index = cmd.getOptionValue("i", File.class, null);
            final File output = cmd.getOptionValue("o", File.class);
            final List<File> files = cmd.getArgs(File.class);

            // Open the enrichment index, if supplied
            KeyQuadIndex enrichmentIndex = null;
            if (index != null) {
                enrichmentIndex = new KeyQuadIndex(index);
                LOGGER.info("Loaded enrichment index at {}", index);
            }

            // Extract terms, emitting one TSV line per distinct term of each document, sorted
            // and with its occurrence count
            final TermExtractor extractor = new TermExtractor(enrichmentIndex);
            try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(output.getAbsolutePath())))) {
                extractor.extract(
                        files,
                        recursive,
                        (final List<Term> terms) -> {
                            try {
                                final Multiset<Term> termSet = HashMultiset.create(terms);
                                for (final Term term : Ordering.natural().sortedCopy(
                                        termSet.elementSet())) {
                                    writer.append(term.getDocument());
                                    writer.append("\t");
                                    writer.append(term.getLayer().getID());
                                    writer.append("\t");
                                    writer.append(term.getToken());
                                    writer.append("\t");
                                    writer.append(Integer.toString(termSet.count(term)));
                                    if (!term.getAttributes().isEmpty()) {
                                        for (final String key : Ordering.natural().sortedCopy(
                                                term.getAttributes().keySet())) {
                                            writer.append("\t");
                                            writer.append(key);
                                            writer.append("=");
                                            writer.append(term.getAttributes().get(key));
                                        }
                                    }
                                    writer.write("\n");
                                }
                            } catch (final IOException ex) {
                                Throwables.propagate(ex);
                            }
                        });
            }

            // Release index resources
            if (enrichmentIndex != null) {
                enrichmentIndex.close();
            }

        } catch (final Throwable ex) {
            // Report failure
            CommandLine.fail(ex);
        }
    }

    public TermExtractor(@Nullable final KeyQuadSource enrichmentIndex) {
        this.enrichmentIndex = enrichmentIndex;
    }

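    /**
     * Extracts terms from all the NAF/RDF file pairs found among the supplied files and
     * directories, invoking the sink once per matched pair with the terms of that document.
     *
     * @param files the files and directories to scan
     * @param recursive whether to descend recursively into directories
     * @param sink the consumer receiving the terms extracted from each document pair
     * @throws IOException on I/O error
     */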
    public void extract(final Iterable<File> files, final boolean recursive,
            final Consumer<List<Term>> sink) throws IOException {

        // Expand directories into their contained files, if recursion was requested
        final List<File> allFiles = Lists.newArrayList(files);
        if (recursive) {
            for (final File file : files) {
                if (file.isDirectory()) {
                    Iterables.addAll(allFiles, Files.fileTreeTraverser().preOrderTraversal(file));
                }
            }
        }

        // Partition files by basename into RDF models (recognized RDF extension) and NAF files
        final Map<String, File> annotationFiles = Maps.newHashMap();
        final Map<String, File> modelFiles = Maps.newHashMap();
        for (final File file : allFiles) {
            if (file.isFile()) {
                if (Rio.getParserFormatForFileName(file.getName()).isPresent()) {
                    modelFiles.put(extractBasename(file.getName()), file);
                } else if (extractExtension(file.getName()).startsWith(".naf")) {
                    annotationFiles.put(extractBasename(file.getName()), file);
                }
            }
        }

        final long ts = System.currentTimeMillis();
        LOGGER.info("Processing {} annotation files, {} RDF files", annotationFiles.size(),
                modelFiles.size());

        // Process each basename having both an annotation file and an RDF file
        int pairs = 0;
        for (final String basename : Ordering.natural().sortedCopy(annotationFiles.keySet())) {
            final File annotationFile = annotationFiles.get(basename);
            final File modelFile = modelFiles.get(basename);
            if (annotationFile != null && modelFile != null) {
                final List<Term> result = extract(annotationFile, modelFile);
                sink.accept(result);
                ++pairs;
            }
        }

        LOGGER.info("Processing of {} file pairs completed in {} ms", pairs,
                System.currentTimeMillis() - ts);
    }

    public List<Term> extract(final Iterable<File> files, final boolean recursive)
            throws IOException {

        // Convenience overload collecting all extracted terms in a single list
        final List<Term> result = Lists.newArrayList();
        extract(files, recursive, (final List<Term> t) -> {
            result.addAll(t);
        });
        return result;
    }

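    /**
     * Extracts the terms of a single document, given its NAF annotation file and the RDF file
     * containing the corresponding knowledge graph.
     *
     * @param annotationFile the NAF file
     * @param modelFile the RDF file, in any syntax recognized by RDF4J Rio
     * @return the extracted terms
     * @throws IOException on I/O or RDF parsing error
     */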
    public List<Term> extract(final File annotationFile, final File modelFile) throws IOException {

        // Parse the NAF annotation file
        final KAFDocument annotation;
        try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(annotationFile.getAbsolutePath())))) {
            annotation = KAFDocument.createFromStream(reader);
        }

        // Read the RDF file into a QuadModel, dropping named graph contexts along the way
        final QuadModel model = QuadModel.create();
        try {
            RDFSources.read(false, true, null, null, null, true, modelFile.getAbsolutePath()).emit(
                    new AbstractRDFHandlerWrapper(RDFHandlers.wrap(model)) {

                        @Override
                        public void handleStatement(final Statement stmt)
                                throws RDFHandlerException {
                            super.handleStatement(Statements.VALUE_FACTORY.createStatement(
                                    stmt.getSubject(), stmt.getPredicate(), stmt.getObject()));
                        }

                    }, 1);
        } catch (final RDFHandlerException ex) {
            throw new IOException(ex);
        }

        return extract(annotation, model);
    }

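    /**
     * Extracts the terms of a single document given its parsed NAF annotation and the RDF
     * statements of its knowledge graph. The RDF model is optionally enriched via the configured
     * index and closed under a partial RDFS inference before term extraction.
     *
     * @param document the NAF document
     * @param model the RDF statements of the document knowledge graph
     * @return the extracted terms
     */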
    public List<Term> extract(final KAFDocument document, final Iterable<Statement> model) {

        // Determine the document ID, falling back to the basename of the document URI
        String documentID = document.getPublic().publicId;
        if (Strings.isNullOrEmpty(documentID)) {
            documentID = extractBasename(document.getPublic().uri);
        }

        final QuadModel quadModel = model instanceof QuadModel ? (QuadModel) model
                : QuadModel.create(model);

        try {
            // Enrich the model with background triples about the IRIs it mentions
            if (this.enrichmentIndex != null) {
                final Set<IRI> uris = Sets.newHashSet();
                for (final Statement stmt : quadModel) {
                    for (final Value value : new Value[] { stmt.getSubject(), stmt.getPredicate(),
                            stmt.getObject(), stmt.getContext() }) {
                        if (value instanceof IRI) {
                            uris.add((IRI) value);
                        }
                    }
                }
                final int numTriplesBefore = quadModel.size();
                this.enrichmentIndex.getRecursive(uris, (final Value v) -> {
                    return v instanceof IRI
                            && RECURSIVE_ENRICHMENT_NAMESPACES.contains(((IRI) v).getNamespace());
                }, RDFHandlers.wrap(quadModel));
                LOGGER.debug("Enriched {} IRIs with {} triples", uris.size(), quadModel.size()
                        - numTriplesBefore);
            }

            // Compute a partial RDFS closure of the model
            final int numTriplesBefore = quadModel.size();
            RDFProcessors.rdfs(RDFSources.wrap(ImmutableList.copyOf(quadModel)), SESAME.NIL, true,
                    true, "rdfs4a", "rdfs4b", "rdfs8").apply(RDFSources.NIL,
                    RDFHandlers.wrap(quadModel), 1);
            LOGGER.debug("Inferred {} triples (total {})", quadModel.size() - numTriplesBefore,
                    quadModel.size());

        } catch (final RDFHandlerException ex) {
            Throwables.propagate(ex);
        }

        // Extract terms from both the NAF annotation and the RDF model
        final List<Term> terms = Lists.newArrayList();
        extract(documentID, document, terms);
        extract(documentID, quadModel, terms);
        return terms;
    }

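    /**
     * Extracts RDF-based terms for the entities of the document: DBpedia URIs, YAGO/SUMO types,
     * FrameBase/PropBank/NomBank predicates and roles, and SKOS concepts, marking terms that
     * hold only by inheritance (super-types, super-properties, broader concepts).
     */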
    private void extract(final String documentID, final QuadModel model,
            final Collection<Term> terms) {

        // Collect the entities of the document; DBpedia entities also emit a URI_DBPEDIA term
        final List<IRI> entities = Lists.newArrayList();
        final Set<IRI> knownEntities = Sets.newHashSet();
        for (final Resource entity : model.filter(null, RDF.TYPE, KS_OLD.ENTITY).subjects()) {
            if (entity instanceof IRI) {
                final IRI uri = (IRI) entity;
                entities.add(uri);
                if (uri.getNamespace().equals(NS_DBPEDIA)) {
                    terms.add(new Term(documentID, Layer.URI_DBPEDIA, uri.getLocalName()));
                    knownEntities.add(uri);
                }
            }
        }

        // Emit type terms (YAGO, SUMO, FrameBase, PropBank, NomBank), distinguishing direct
        // types from types holding only via rdfs:subClassOf
        for (final IRI entity : entities) {
            final Set<IRI> types = Sets.newHashSet();
            for (final Value type : model.filter(entity, RDF.TYPE, null).objects()) {
                if (type instanceof IRI) {
                    types.add((IRI) type);
                }
            }
            final Set<IRI> parents = Sets.newHashSet();
            for (final IRI type : types) {
                if (!FrameBase.isMicroframe(type)) {
                    for (final Value parentType : model.filter(type, RDFS.SUBCLASSOF, null)
                            .objects()) {
                        if (parentType instanceof IRI && !parentType.equals(type)) {
                            parents.add((IRI) parentType);
                        }
                    }
                }
            }
            final Set<IRI> directTypes = Sets.difference(types, parents);
            for (final IRI type : types) {
                final Layer typeLayer = TYPE_MAP.get(type.getNamespace());
                if (typeLayer != null) {
                    if (directTypes.contains(type)) {
                        terms.add(new Term(documentID, typeLayer, type.getLocalName()));
                    } else {
                        terms.add(new Term(documentID, typeLayer, type.getLocalName(),
                                "inherited", true));
                    }
                }
            }
        }

        // Emit role terms for entity properties, distinguishing directly asserted properties
        // from properties holding only via rdfs:subPropertyOf
        for (final IRI entity : entities) {
            final Set<Statement> stmts = Sets.newHashSet(model.filter(entity, null, null));
            final Set<Statement> parentStmts = Sets.newHashSet();
            for (final Statement stmt : stmts) {
                final IRI pred = stmt.getPredicate();
                final Value obj = stmt.getObject();
                for (final Value parentPred : model.filter(pred, RDFS.SUBPROPERTYOF, null)
                        .objects()) {
                    if (parentPred instanceof IRI && !parentPred.equals(pred)) {
                        parentStmts.add(Statements.VALUE_FACTORY.createStatement(entity,
                                (IRI) parentPred, obj));
                    }
                }
            }
            final Set<Statement> directStmts = Sets.difference(stmts, parentStmts);
            for (final Statement stmt : stmts) {
                final IRI uri = stmt.getPredicate();
                final Layer propertyLayer = PROPERTY_MAP.get(uri.getNamespace());
                if (propertyLayer != null) {
                    if (directStmts.contains(stmt)) {
                        terms.add(new Term(documentID, propertyLayer, uri.getLocalName()));
                    } else {
                        terms.add(new Term(documentID, propertyLayer, uri.getLocalName(),
                                "inherited", true));
                    }
                }
            }
        }

        // Emit concept terms: the most specific types/entities in CONCEPT_MAP namespaces are
        // direct concepts; their skos:broader ancestors are emitted as inherited concepts
        for (final IRI entity : entities) {
            final Set<IRI> concepts = Sets.newHashSet();
            final Set<IRI> directConcepts = Sets.newHashSet();
            final List<IRI> queue = Lists.newLinkedList();

            for (final Value type : model.filter(entity, RDF.TYPE, null).objects()) {
                if (type instanceof IRI && CONCEPT_MAP.containsKey(((IRI) type).getNamespace())) {
                    directConcepts.add((IRI) type);
                }
            }
            for (final IRI type : ImmutableList.copyOf(directConcepts)) {
                if (!FrameBase.isMicroframe(type)) {
                    final Set<Value> parents = Sets.newHashSet(model.filter(type, RDFS.SUBCLASSOF,
                            null).objects());
                    parents.remove(type);
                    directConcepts.removeAll(parents);
                }
            }

            if (knownEntities.contains(entity)) {
                directConcepts.add(entity);
            }

            concepts.addAll(directConcepts);
            queue.addAll(directConcepts);
            while (!queue.isEmpty()) {
                final IRI uri = queue.remove(0);
                for (final Value parent : model.filter(uri, SKOS.BROADER, null).objects()) {
                    if (parent instanceof IRI) {
                        final IRI parentIRI = (IRI) parent;
                        if (CONCEPT_MAP.containsKey(parentIRI.getNamespace())
                                && !concepts.contains(parentIRI)) {
                            concepts.add(parentIRI);
                            queue.add(parentIRI);
                        }
                    }
                }
            }
            for (final IRI concept : concepts) {
                final String prefix = CONCEPT_MAP.get(concept.getNamespace());
                final String name = prefix + ":" + concept.getLocalName();
                if (directConcepts.contains(concept)) {
                    terms.add(new Term(documentID, Layer.CONCEPT, name));
                } else {
                    terms.add(new Term(documentID, Layer.CONCEPT, name, "inherited", true));
                }
            }
        }
    }

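    /**
     * Extracts text-based terms from the NAF annotation: the raw text, stems, lemmas, sub-word
     * stems, and WordNet-derived terms (synsets, hypernyms, synonyms and related lemmas/stems).
     */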
    private void extract(final String documentID, final KAFDocument document,
            final Collection<Term> terms) {

        // Emit the raw document text as a single term, with whitespace normalized
        terms.add(new Term(documentID, Layer.RAW, document.getRawText().replace('\n', ' ')
                .replace('\r', ' ').replace('\t', ' ')));

        for (final ixa.kaflib.Term term : document.getTerms()) {

            // Skip stop words and tokens without letters or digits
            final String wf = term.getStr().trim();
            if (!isValidTerm(wf)) {
                continue;
            }

            // Emit stem and lemma terms
            final String stem = Stemming.stem("en", wf.toLowerCase());
            terms.add(new Term(documentID, Layer.STEM_TEXT, stem));

            final String lemma = term.getLemma().toLowerCase();
            terms.add(new Term(documentID, Layer.LEMMA_TEXT, lemma));

            // Emit stems of sub-words (e.g. parts of hyphenated or camel-case tokens)
            for (final String subWord : SubWordExtractor.extract(wf)) {
                if (isValidTerm(subWord)) {
                    final String subWordStem = Stemming.stem("en", subWord.toLowerCase());
                    terms.add(new Term(documentID, Layer.STEM_SUBWORD, subWordStem));
                }
            }

            // Map the POS tag to a WordNet POS, if possible
            final String pos = term.getMorphofeat();
            final String wnPos;
            if (pos.startsWith("NN")) {
                wnPos = WordNet.POS_NOUN;
            } else if (pos.startsWith("VB")) {
                wnPos = WordNet.POS_VERB;
            } else if (pos.startsWith("JJ")) {
                wnPos = WordNet.POS_ADJECTIVE;
            } else if (pos.startsWith("RB") || pos.equals("WRB")) {
                wnPos = WordNet.POS_ADVERB;
            } else {
                wnPos = null;
            }

            // Emit WordNet-based terms for the selected synset
            if (wnPos != null) {
                final List<String> synsets = WordNet.getSynsetsForLemma(lemma, wnPos);
                if (!synsets.isEmpty()) {

                    // Compute the 'certain' synsets: those in the hypernym closure of every
                    // candidate synset for this lemma
                    Set<String> synsetsCertain = null;
                    for (final String synset : synsets) {
                        if (synsetsCertain == null) {
                            synsetsCertain = WordNet.getHypernyms(synset, true);
                        } else {
                            synsetsCertain.retainAll(WordNet.getHypernyms(synset, true));
                        }
                    }

                    // Pick the synset: unambiguous if unique, otherwise the one chosen by the
                    // 'wn30-ukb' word sense disambiguation reference, if any
                    String synset = null;
                    if (synsets.size() == 1) {
                        synset = synsets.get(0);
                    } else {
                        final ExternalRef synsetRef = NAFUtils.getRef(term, "wn30-ukb", null);
                        if (synsetRef != null) {
                            synset = synsetRef.getReference();
                        }
                    }

                    // Emit synset and hypernym terms; if the synset is certain, also emit
                    // synonyms and derivationally related synsets, lemmas and stems
                    if (synset != null) {
                        expandSynsets(documentID, synset, 0, synsetsCertain, Sets.newHashSet(),
                                terms);
                        if (synsetsCertain.contains(synset)) {
                            for (final String synonym : WordNet.getLemmas(synset)) {
                                terms.add(new Term(documentID, Layer.LEMMA_SYNONYM, synonym));
                                terms.add(new Term(documentID, Layer.STEM_SYNONYM, Stemming.stem(
                                        "en", synonym)));
                            }
                            final Set<String> relatedSynsets = Sets.newHashSet();
                            for (final PointerType pt : new PointerType[] { PointerType.DERIVED,
                                    PointerType.PERTAINYM, PointerType.NOMINALIZATION,
                                    PointerType.PARTICIPLE_OF }) {
                                relatedSynsets.addAll(WordNet.getGenericSet(synset, pt));
                            }
                            final Set<String> relatedLemmas = Sets.newHashSet();
                            for (final String relatedSynset : relatedSynsets) {
                                relatedLemmas.addAll(WordNet.getLemmas(relatedSynset));
                                terms.add(new Term(documentID, Layer.SYNSET_RELATED,
                                        relatedSynset, "certain", true));
                            }
                            for (final String relatedLemma : relatedLemmas) {
                                terms.add(new Term(documentID, Layer.LEMMA_RELATED, relatedLemma));
                                terms.add(new Term(documentID, Layer.STEM_RELATED, Stemming.stem(
                                        "en", relatedLemma)));
                            }
                        }
                    }
                }
            }
        }
    }

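    /**
     * Recursively emits a SYNSET_SPECIFIC term for the given synset and SYNSET_HYPERNYN terms
     * for its hypernyms, annotating each term with whether the synset is certain and, for
     * hypernyms, with the distance from the original synset.
     */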
    private void expandSynsets(final String documentID, final String synset, final int len,
            final Set<String> synsetsCertain, final Set<String> synsetsSeen,
            final Collection<Term> terms) {
        if (synsetsSeen.add(synset)) {
            final boolean certain = synsetsCertain == null || synsetsCertain.contains(synset);
            if (len == 0) {
                terms.add(new Term(documentID, Layer.SYNSET_SPECIFIC, synset, "certain", certain));
            } else {
                terms.add(new Term(documentID, Layer.SYNSET_HYPERNYN, synset, "certain", certain,
                        "len", len));
            }
            for (final String hypernym : WordNet.getHypernyms(synset, false)) {
                expandSynsets(documentID, hypernym, len + 1, synsetsCertain, synsetsSeen, terms);
            }
        }
    }

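    /**
     * Checks whether a word form should be indexed: between 2 and 200 characters, not a Lucene
     * stop word, and containing at least one letter or digit.
     */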
    private static boolean isValidTerm(final String wf) {
        if (wf.length() >= 2 && wf.length() <= 200
                && !LUCENE_STOP_WORDS.contains(wf.toLowerCase())) {
            for (int i = 0; i < wf.length(); ++i) {
                if (Character.isLetterOrDigit(wf.charAt(i))) {
                    return true;
                }
            }
        }
        return false;
    }

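    /**
     * Returns the basename of a file name or URL, i.e., the last path segment stripped of its
     * extension (including a compression extension such as .gz) and of any query or fragment
     * part.
     */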
    private static String extractBasename(final String location) {
        Objects.requireNonNull(location);
        int extEnd = location.length() - (location.endsWith("/") ? 1 : 0);
        if (location.indexOf(':') >= 0) {
            int index = location.lastIndexOf('#');
            extEnd = index < 0 ? extEnd : index;
            index = location.lastIndexOf('?', extEnd);
            extEnd = index < 0 ? extEnd : index;
        }
        final int nameStart = Math.max(-1, location.lastIndexOf('/', extEnd - 1)) + 1;
        int extStart = location.lastIndexOf('.', extEnd);
        final String ext = extStart < 0 ? "" : location.substring(extStart, extEnd);
        if (ext.equals(".gz") || ext.equals(".bz2") || ext.equals(".xz") || ext.equals(".7z")
                || ext.equals(".lz4")) {
            final int index = location.lastIndexOf('.', extStart - 1);
            extStart = index < 0 ? extStart : index;
        }
        return location.substring(nameStart, extStart);
    }

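    /**
     * Returns the extension of a file name or URL, including a leading compression extension
     * such as .gz (e.g. ".naf.gz"), ignoring any query or fragment part.
     */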
    private static String extractExtension(final String location) {
        Objects.requireNonNull(location);
        final int index = location.indexOf(':');
        int extEnd = location.length();
        if (index >= 0) {
            if (location.charAt(0) == '.') {
                return location.substring(0, index);
            }
            int index2 = location.lastIndexOf('#');
            extEnd = index2 < 0 ? extEnd : index2;
            index2 = location.lastIndexOf('?', extEnd);
            extEnd = index2 < 0 ? extEnd : index2;
        }
        int extStart = location.lastIndexOf('.', extEnd);
        String ext = extStart < 0 ? "" : location.substring(extStart, extEnd);
        if (ext.equals(".gz") || ext.equals(".bz2") || ext.equals(".xz") || ext.equals(".7z")
                || ext.equals(".lz4")) {
            extStart = location.lastIndexOf('.', extStart - 1);
            ext = extStart < 0 ? "" : location.substring(extStart, extEnd);
        }
        return ext;
    }

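    /**
     * Splits a token into sub-words at case changes, digit boundaries and non-alphanumeric
     * delimiters, similarly to Lucene's word-delimiter filtering, also emitting concatenations
     * of adjacent sub-words and the original token.
     */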
    private static final class SubWordExtractor {

        private static final int LOWER = 0x01;

        private static final int UPPER = 0x02;

        private static final int DIGIT = 0x04;

        private static final int SUBWORD_DELIM = 0x08;

        private static final int ALPHA = LOWER | UPPER;

        private static final byte[] WORD_DELIM_TABLE;

        static {
            final byte[] tab = new byte[256];
            for (int i = 0; i < 256; i++) {
                byte code = 0;
                if (Character.isLowerCase(i)) {
                    code |= LOWER;
                } else if (Character.isUpperCase(i)) {
                    code |= UPPER;
                } else if (Character.isDigit(i)) {
                    code |= DIGIT;
                }
                if (code == 0) {
                    code = SUBWORD_DELIM;
                }
                tab[i] = code;
            }
            WORD_DELIM_TABLE = tab;
        }

        private static int charType(final int ch) {
            if (ch < WORD_DELIM_TABLE.length) {
                return WORD_DELIM_TABLE[ch];
            } else if (Character.isLowerCase(ch)) {
                return LOWER;
            } else if (Character.isLetter(ch)) {
                return UPPER;
            } else {
                return SUBWORD_DELIM;
            }
        }

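        /**
         * Returns the set of sub-words of the token, together with concatenations of adjacent
         * runs of same-kind sub-words (alphabetic vs. non-alphabetic), the concatenation of all
         * sub-words, and the token itself.
         */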
        static Set<String> extract(final String token) {
            final List<String> subTokens = Lists.newArrayList();
            final int len = token.length();
            if (len != 0) {
                int start = 0;
                int type = charType(token.charAt(start));
                while (start < len) {
                    while ((type & SUBWORD_DELIM) != 0 && ++start < len) {
                        type = charType(token.charAt(start));
                    }
                    int pos = start;
                    int lastType = type;
                    while (pos < len) {
                        if (type != lastType && ((lastType & UPPER) == 0 || (type & LOWER) == 0)) {
                            subTokens.add(token.substring(start, pos));
                            break;
                        }
                        if (++pos >= len) {
                            subTokens.add(token.substring(start, pos));
                            break;
                        }
                        lastType = type;
                        type = charType(token.charAt(pos));
                    }
                    start = pos;
                }
                final int numtok = subTokens.size();
                if (numtok > 1) {
                    subTokens.add(Joiner.on("").join(subTokens));
                    String tok = subTokens.get(0);
                    boolean isWord = (charType(tok.charAt(0)) & ALPHA) != 0;
                    boolean wasWord = isWord;
                    for (int i = 0; i < numtok;) {
                        int j;
                        for (j = i + 1; j < numtok; j++) {
                            wasWord = isWord;
                            tok = subTokens.get(j);
                            isWord = (charType(tok.charAt(0)) & ALPHA) != 0;
                            if (isWord != wasWord) {
                                break;
                            }
                        }
                        subTokens.add(Joiner.on("").join(subTokens.subList(i, j)));
                        i = j;
                    }
                }
            }
            subTokens.add(token);
            return ImmutableSet.copyOf(subTokens);
        }

    }

}