package eu.fbk.dkm.pikes.query;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Consumer;

import javax.annotation.Nullable;

import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multiset;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import com.google.common.io.Files;

import net.didion.jwnl.data.PointerType;

import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDFS;
import org.eclipse.rdf4j.model.vocabulary.SESAME;
import org.eclipse.rdf4j.model.vocabulary.SKOS;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.Rio;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import ixa.kaflib.ExternalRef;
import ixa.kaflib.KAFDocument;

import eu.fbk.dkm.pikes.kv.KeyQuadIndex;
import eu.fbk.dkm.pikes.kv.KeyQuadSource;
import eu.fbk.dkm.pikes.query.Term.Layer;
import eu.fbk.dkm.pikes.rdf.vocab.KS_OLD;
import eu.fbk.dkm.pikes.rdf.vocab.SUMO;
import eu.fbk.dkm.pikes.resources.FrameBase;
import eu.fbk.dkm.pikes.resources.NAFUtils;
import eu.fbk.dkm.pikes.resources.Stemming;
import eu.fbk.dkm.pikes.resources.WordNet;
import eu.fbk.dkm.pikes.resources.YagoTaxonomy;
import eu.fbk.rdfpro.AbstractRDFHandlerWrapper;
import eu.fbk.rdfpro.RDFHandlers;
import eu.fbk.rdfpro.RDFProcessors;
import eu.fbk.rdfpro.RDFSources;
import eu.fbk.rdfpro.util.IO;
import eu.fbk.rdfpro.util.QuadModel;
import eu.fbk.rdfpro.util.Statements;
import eu.fbk.utils.core.CommandLine;

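/**
 * Extracts searchable {@link Term}s from pairs of NAF annotation files and RDF model files,
 * covering text layers (stems, lemmas, subwords), WordNet layers (synsets, hypernyms, synonyms,
 * related terms) and semantic layers (YAGO/SUMO types, FrameBase/PropBank/NomBank predicates and
 * roles, DBpedia URIs and concepts).
 *
 * <p>
 * Programmatic usage, as a minimal illustrative sketch (file names are hypothetical):
 * </p>
 *
 * <pre>
 * TermExtractor extractor = new TermExtractor(null); // null: no enrichment index
 * List&lt;Term&gt; terms = extractor.extract(new File("doc.naf"), new File("doc.trig"));
 * </pre>
 */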
public class TermExtractor {

    private static final Logger LOGGER = LoggerFactory.getLogger(TermExtractor.class);

    private static final Set<String> LUCENE_STOP_WORDS = ImmutableSet.of("a", "an", "and", "are",
            "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not",
            "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with", "'s"); // added 's

    private static final String NS_DBPEDIA = "http://dbpedia.org/resource/";

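    // Maps from IRI namespace to the term layer used for entity types/predicates (TYPE_MAP) and
    // for roles (PROPERTY_MAP), and to the prefix used for concept terms (CONCEPT_MAP)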
    private static final Map<String, Layer> TYPE_MAP = ImmutableMap.of(YagoTaxonomy.NAMESPACE,
            Layer.TYPE_YAGO, SUMO.NAMESPACE, Layer.TYPE_SUMO, FrameBase.NAMESPACE,
            Layer.PREDICATE_FRB, "http://www.newsreader-project.eu/ontologies/propbank/",
            Layer.PREDICATE_PB, "http://www.newsreader-project.eu/ontologies/nombank/",
            Layer.PREDICATE_NB);

    private static final Map<String, Layer> PROPERTY_MAP = ImmutableMap.of(FrameBase.NAMESPACE,
            Layer.ROLE_FRB, "http://www.newsreader-project.eu/ontologies/propbank/",
            Layer.ROLE_PB, "http://www.newsreader-project.eu/ontologies/nombank/", Layer.ROLE_NB);

    private static final Set<String> RECURSIVE_ENRICHMENT_NAMESPACES = ImmutableSet.of(
            YagoTaxonomy.NAMESPACE, FrameBase.NAMESPACE, SUMO.NAMESPACE);

    private static final Map<String, String> CONCEPT_MAP = ImmutableMap.of(YagoTaxonomy.NAMESPACE,
            "dbyago", FrameBase.NAMESPACE, "frb", NS_DBPEDIA, "dbpedia", "entity:", "entity");

    private final KeyQuadSource enrichmentIndex;

    public static void main(final String[] args) {
        try {
            // Parse command line
            final CommandLine cmd = CommandLine
                    .parser()
                    .withName("pikes-tex")
                    .withOption("i", "index", "use index at PATH for IRI enrichment", "PATH",
                            CommandLine.Type.FILE, true, false, false)
                    .withOption("r", "recursive", "whether to recurse into input directories")
                    .withOption("o", "output", "output base name", "PATH",
                            CommandLine.Type.STRING, true, false, true)
                    .withHeader("extracts terms from paired NAF annotation and RDF files, "
                            + "writing them to a TSV file")
                    .parse(args);

            // Extract options
            final boolean recursive = cmd.hasOption("r");
            final File index = cmd.getOptionValue("i", File.class, null);
            final File output = cmd.getOptionValue("o", File.class);
            final List<File> files = cmd.getArgs(File.class);

            // Initialize enrichment index, if enabled
            KeyQuadIndex enrichmentIndex = null;
            if (index != null) {
                enrichmentIndex = new KeyQuadIndex(index);
                LOGGER.info("Loaded enrichment index at {}", index);
            }

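            // Terms are aggregated per document into a multiset and written as TSV lines:
            //   <document> TAB <layer id> TAB <token> TAB <count> [TAB <key>=<value>]...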
            // Perform the extraction and write the results
            final TermExtractor extractor = new TermExtractor(enrichmentIndex);
            try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(output.getAbsolutePath())))) {
                extractor.extract(
                        files,
                        recursive,
                        (final List<Term> terms) -> {
                            try {
                                final Multiset<Term> termSet = HashMultiset.create(terms);
                                for (final Term term : Ordering.natural().sortedCopy(
                                        termSet.elementSet())) {
                                    writer.append(term.getDocument());
                                    writer.append("\t");
                                    writer.append(term.getLayer().getID());
                                    writer.append("\t");
                                    writer.append(term.getToken());
                                    writer.append("\t");
                                    writer.append(Integer.toString(termSet.count(term)));
                                    if (!term.getAttributes().isEmpty()) {
                                        for (final String key : Ordering.natural().sortedCopy(
                                                term.getAttributes().keySet())) {
                                            writer.append("\t");
                                            writer.append(key);
                                            writer.append("=");
                                            writer.append(term.getAttributes().get(key));
                                        }
                                    }
                                    writer.append("\n");
                                }
                            } catch (final IOException ex) {
                                Throwables.propagate(ex);
                            }
                        });
            }

            // Release enrichment index, if used
            if (enrichmentIndex != null) {
                enrichmentIndex.close();
            }

        } catch (final Throwable ex) {
            // Display error information and terminate
            CommandLine.fail(ex);
        }
    }

    public TermExtractor(@Nullable final KeyQuadSource enrichmentIndex) {
        this.enrichmentIndex = enrichmentIndex;
    }

    public void extract(final Iterable<File> files, final boolean recursive,
            final Consumer<List<Term>> sink) throws IOException {

        // Expand file list if recursive
        final List<File> allFiles = Lists.newArrayList(files);
        if (recursive) {
            for (final File file : files) {
                if (file.isDirectory()) {
                    Iterables.addAll(allFiles, Files.fileTreeTraverser().preOrderTraversal(file));
                }
            }
        }

        // Index NAF files and RDF files by name (without extension and folder)
        final Map<String, File> annotationFiles = Maps.newHashMap();
        final Map<String, File> modelFiles = Maps.newHashMap();
        for (final File file : allFiles) {
            if (file.isFile()) {
                if (Rio.getParserFormatForFileName(file.getName()).isPresent()) {
                    modelFiles.put(extractBasename(file.getName()), file);
                } else if (extractExtension(file.getName()).startsWith(".naf")) {
                    annotationFiles.put(extractBasename(file.getName()), file);
                }
            }
        }

        // Log before processing
        final long ts = System.currentTimeMillis();
        LOGGER.info("Processing {} annotation files, {} RDF files", annotationFiles.size(),
                modelFiles.size());

        // Process each annotation / RDF file pair, aggregating the results
        int pairs = 0;
        for (final String basename : Ordering.natural().sortedCopy(annotationFiles.keySet())) {
            final File annotationFile = annotationFiles.get(basename);
            final File modelFile = modelFiles.get(basename);
            if (annotationFile != null && modelFile != null) {
                final List<Term> result = extract(annotationFile, modelFile);
                sink.accept(result);
                ++pairs;
            }
        }

        // Log after processing
        LOGGER.info("Processing of {} file pairs completed in {} ms", pairs,
                System.currentTimeMillis() - ts);
    }

    public List<Term> extract(final Iterable<File> files, final boolean recursive)
            throws IOException {

        final List<Term> result = Lists.newArrayList();
        extract(files, recursive, (final List<Term> t) -> {
            result.addAll(t);
        });
        return result;
    }

    public List<Term> extract(final File annotationFile, final File modelFile) throws IOException {

        // Read annotation file
        final KAFDocument annotation;
        try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(annotationFile.getAbsolutePath())))) {
            annotation = KAFDocument.createFromStream(reader);
        }

        // Read RDF file, recreating each statement without its context so that quads collapse
        // to plain triples in the model
        final QuadModel model = QuadModel.create();
        try {
            RDFSources.read(false, true, null, null, null, true, modelFile.getAbsolutePath()).emit(
                    new AbstractRDFHandlerWrapper(RDFHandlers.wrap(model)) {

                        @Override
                        public void handleStatement(final Statement stmt)
                                throws RDFHandlerException {
                            super.handleStatement(Statements.VALUE_FACTORY.createStatement(
                                    stmt.getSubject(), stmt.getPredicate(), stmt.getObject()));
                        }

                    }, 1);
        } catch (final RDFHandlerException ex) {
            throw new IOException(ex);
        }

        // Delegate
        return extract(annotation, model);
    }

    public List<Term> extract(final KAFDocument document, final Iterable<Statement> model) {

        // Obtain document ID from NAF document
        String documentID = document.getPublic().publicId;
        if (Strings.isNullOrEmpty(documentID)) {
            documentID = extractBasename(document.getPublic().uri);
        }

        // Obtain a quad model over RDF statements
        final QuadModel quadModel = model instanceof QuadModel ? (QuadModel) model //
                : QuadModel.create(model);

        try {
            // Recursively enrich model IRIs if an enrichment index is available
            if (this.enrichmentIndex != null) {
                final Set<IRI> uris = Sets.newHashSet();
                for (final Statement stmt : quadModel) {
                    for (final Value value : new Value[] { stmt.getSubject(), stmt.getPredicate(),
                            stmt.getObject(), stmt.getContext() }) {
                        if (value instanceof IRI) {
                            uris.add((IRI) value);
                        }
                    }
                }
                final int numTriplesBefore = quadModel.size();
                this.enrichmentIndex.getRecursive(uris, (final Value v) -> {
                    return v instanceof IRI && //
                            RECURSIVE_ENRICHMENT_NAMESPACES.contains(((IRI) v).getNamespace());
                }, RDFHandlers.wrap(quadModel));
                LOGGER.debug("Enriched {} IRIs with {} triples", uris.size(),
                        quadModel.size() - numTriplesBefore);
            }

            // Perform RDFS inference
            final int numTriplesBefore = quadModel.size();
            RDFProcessors.rdfs(RDFSources.wrap(ImmutableList.copyOf(quadModel)), SESAME.NIL, true,
                    true, "rdfs4a", "rdfs4b", "rdfs8").apply(RDFSources.NIL,
                    RDFHandlers.wrap(quadModel), 1);
            LOGGER.debug("Inferred {} triples (total {})", quadModel.size() - numTriplesBefore,
                    quadModel.size());

        } catch (final RDFHandlerException ex) {
            // Wrap and propagate
            Throwables.propagate(ex);
        }

        // Process NAF and model
        final List<Term> terms = Lists.newArrayList();
        extract(documentID, document, terms);
        extract(documentID, quadModel, terms);
        return terms;
    }

    private void extract(final String documentID, final QuadModel model,
            final Collection<Term> terms) {

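        // Note: types, properties and concepts asserted directly for an entity are emitted as
        // plain terms, while the ones holding only via RDFS subclass/subproperty or SKOS broader
        // links are emitted with an extra "inherited" = true attribute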
        // Emit terms for IRIs
        final List<IRI> entities = Lists.newArrayList();
        final Set<IRI> knownEntities = Sets.newHashSet();
        for (final Resource entity : model.filter(null, RDF.TYPE, KS_OLD.ENTITY).subjects()) {
            if (entity instanceof IRI) {
                final IRI uri = (IRI) entity;
                entities.add(uri);
                if (uri.getNamespace().equals(NS_DBPEDIA)) {
                    terms.add(new Term(documentID, Layer.URI_DBPEDIA, uri.getLocalName()));
                    knownEntities.add(uri);
                }
                // TODO: entity:XXX treated as anonymous instances
                //                else if (model.contains(uri, FOAF.NAME, null)) {
                //                    terms.add(new Term(documentID, Layer.IRI_CUSTOM, uri.getLocalName()));
                //                    knownEntities.add(uri);
                //                }
            }
        }

        // Emit related entities TODO
        //        for (final IRI entity : entities) {
        //            for (final Value related : model.filter(entity, SKOS.RELATED, null).objects()) {
        //                if (related instanceof IRI) {
        //                    final IRI uri = (IRI) related;
        //                    if (uri.getNamespace().equals(NS_DBPEDIA)) {
        //                        terms.add(new Term(documentID, Layer.IRI_RELATED, uri.getLocalName()));
        //                    }
        //                }
        //            }
        //        }

        // Emit types / predicates
        for (final IRI entity : entities) {
            final Set<IRI> types = Sets.newHashSet();
            for (final Value type : model.filter(entity, RDF.TYPE, null).objects()) {
                if (type instanceof IRI) {
                    types.add((IRI) type);
                }
            }
            final Set<IRI> parents = Sets.newHashSet();
            for (final IRI type : types) {
                if (!FrameBase.isMicroframe(type)) {
                    for (final Value parentType : model.filter(type, RDFS.SUBCLASSOF, null)
                            .objects()) {
                        if (parentType instanceof IRI && !parentType.equals(type)) {
                            parents.add((IRI) parentType);
                        }
                    }
                }
            }
            final Set<IRI> directTypes = Sets.difference(types, parents);
            for (final IRI type : types) {
                final Layer typeLayer = TYPE_MAP.get(type.getNamespace());
                if (typeLayer != null) {
                    if (directTypes.contains(type)) {
                        terms.add(new Term(documentID, typeLayer, type.getLocalName()));
                    } else {
                        terms.add(new Term(documentID, typeLayer, type.getLocalName(),
                                "inherited", true));
                    }
                }
            }
        }

        // Emit roles
        for (final IRI entity : entities) {
            final Set<Statement> stmts = Sets.newHashSet(model.filter(entity, null, null));
            final Set<Statement> parentStmts = Sets.newHashSet();
            for (final Statement stmt : stmts) {
                final IRI pred = stmt.getPredicate();
                final Value obj = stmt.getObject();
                for (final Value parentPred : model.filter(pred, RDFS.SUBPROPERTYOF, null)
                        .objects()) {
                    if (parentPred instanceof IRI && !parentPred.equals(pred)) {
                        parentStmts.add(Statements.VALUE_FACTORY.createStatement(entity,
                                (IRI) parentPred, obj));
                    }
                }
            }
            final Set<Statement> directStmts = Sets.difference(stmts, parentStmts);
            for (final Statement stmt : stmts) {
                final IRI uri = stmt.getPredicate();
                final Layer propertyLayer = PROPERTY_MAP.get(uri.getNamespace());
                if (propertyLayer != null) {
                    if (directStmts.contains(stmt)) {
                        terms.add(new Term(documentID, propertyLayer, uri.getLocalName()));
                    } else {
                        terms.add(new Term(documentID, propertyLayer, uri.getLocalName(),
                                "inherited", true));
                    }
                }
            }
        }

        // Emit concepts
        for (final IRI entity : entities) {
            final Set<IRI> concepts = Sets.newHashSet();
            final Set<IRI> directConcepts = Sets.newHashSet();
            final List<IRI> queue = Lists.newLinkedList();

            for (final Value type : model.filter(entity, RDF.TYPE, null).objects()) {
                if (type instanceof IRI && CONCEPT_MAP.containsKey(((IRI) type).getNamespace())) {
                    directConcepts.add((IRI) type);
                }
            }
            for (final IRI type : ImmutableList.copyOf(directConcepts)) {
                if (!FrameBase.isMicroframe(type)) {
                    final Set<Value> parents = Sets.newHashSet(model.filter(type, RDFS.SUBCLASSOF,
                            null).objects());
                    parents.remove(type);
                    directConcepts.removeAll(parents);
                }
            }

            if (knownEntities.contains(entity)) {
                directConcepts.add(entity);
            }

            concepts.addAll(directConcepts);
            queue.addAll(directConcepts);
            while (!queue.isEmpty()) {
                final IRI uri = queue.remove(0);
                for (final Value parent : model.filter(uri, SKOS.BROADER, null).objects()) {
                    if (parent instanceof IRI) {
                        final IRI parentIRI = (IRI) parent;
                        if (CONCEPT_MAP.containsKey(parentIRI.getNamespace())
                                && !concepts.contains(parentIRI)) {
                            concepts.add(parentIRI);
                            queue.add(parentIRI);
                        }
                    }
                }
            }
            for (final IRI concept : concepts) {
                final String prefix = CONCEPT_MAP.get(concept.getNamespace());
                final String name = prefix + ":" + concept.getLocalName();
                if (directConcepts.contains(concept)) {
                    terms.add(new Term(documentID, Layer.CONCEPT, name));
                } else {
                    terms.add(new Term(documentID, Layer.CONCEPT, name, "inherited", true));
                }
            }
        }
    }

    private void extract(final String documentID, final KAFDocument document,
            final Collection<Term> terms) {

        // Emit raw text term
        terms.add(new Term(documentID, Layer.RAW, document.getRawText().replace('\n', ' ')
                .replace('\r', ' ').replace('\t', ' ')));

        // Iterate over all the tokens in the document
        for (final ixa.kaflib.Term term : document.getTerms()) {

            // Extract the token surface form (lowercased later, where needed)
            final String wf = term.getStr().trim();

            // Apply stop word filter
            if (!isValidTerm(wf)) {
                continue;
            }

            // Emit stem term
            final String stem = Stemming.stem("en", wf.toLowerCase());
            terms.add(new Term(documentID, Layer.STEM_TEXT, stem));

            // Emit lemma term
            final String lemma = term.getLemma().toLowerCase();
            terms.add(new Term(documentID, Layer.LEMMA_TEXT, lemma));

            // Emit subword terms
            for (final String subWord : SubWordExtractor.extract(wf)) {
                if (isValidTerm(subWord)) {
                    final String subWordStem = Stemming.stem("en", subWord.toLowerCase());
                    terms.add(new Term(documentID, Layer.STEM_SUBWORD, subWordStem));
                }
            }

            // Map the POS tag to a WordNet POS (n, v, a, r)
            final String pos = term.getMorphofeat();
            final String wnPos;
            if (pos.startsWith("NN")) {
                wnPos = WordNet.POS_NOUN;
            } else if (pos.startsWith("VB")) {
                wnPos = WordNet.POS_VERB;
            } else if (pos.startsWith("JJ")) {
                wnPos = WordNet.POS_ADJECTIVE;
            } else if (pos.startsWith("RB") || pos.equals("WRB")) {
                wnPos = WordNet.POS_ADVERB;
            } else {
                wnPos = null;
            }

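            // Synset disambiguation: a lemma with a single synset for the POS is unambiguous;
            // otherwise the wn30-ukb external reference is used, if available. A hypernym is
            // considered "certain" if it is shared by all the candidate synsets of the lemma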
            // Emit synset terms
            if (wnPos != null) {
                final List<String> synsets = WordNet.getSynsetsForLemma(lemma, wnPos);
                if (!synsets.isEmpty()) {
                    Set<String> synsetsCertain = null;
                    for (final String synset : synsets) {
                        if (synsetsCertain == null) {
                            synsetsCertain = WordNet.getHypernyms(synset, true);
                        } else {
                            synsetsCertain.retainAll(WordNet.getHypernyms(synset, true));
                        }
                    }
                    String synset = null;
                    if (synsets.size() == 1) {
                        synset = synsets.get(0);
                    } else {
                        final ExternalRef synsetRef = NAFUtils.getRef(term, "wn30-ukb", null);
                        if (synsetRef != null) {
                            synset = synsetRef.getReference();
                        }
                    }
                    if (synset != null) {
                        expandSynsets(documentID, synset, 0, synsetsCertain, Sets.newHashSet(),
                                terms);
                        if (synsetsCertain.contains(synset)) {
                            for (final String synonym : WordNet.getLemmas(synset)) {
                                terms.add(new Term(documentID, Layer.LEMMA_SYNONYM, synonym));
                                terms.add(new Term(documentID, Layer.STEM_SYNONYM, Stemming.stem(
                                        "en", synonym)));
                            }
                            final Set<String> relatedSynsets = Sets.newHashSet();
                            for (final PointerType pt : new PointerType[] { PointerType.DERIVED,
                                    PointerType.PERTAINYM, PointerType.NOMINALIZATION,
                                    PointerType.PARTICIPLE_OF }) {
                                relatedSynsets.addAll(WordNet.getGenericSet(synset, pt));
                            }
                            final Set<String> relatedLemmas = Sets.newHashSet();
                            for (final String relatedSynset : relatedSynsets) {
                                relatedLemmas.addAll(WordNet.getLemmas(relatedSynset));
                                terms.add(new Term(documentID, Layer.SYNSET_RELATED,
                                        relatedSynset, "certain", true));
                            }
                            for (final String relatedLemma : relatedLemmas) {
                                terms.add(new Term(documentID, Layer.LEMMA_RELATED, relatedLemma));
                                terms.add(new Term(documentID, Layer.STEM_RELATED, Stemming.stem(
                                        "en", relatedLemma)));
                            }
                        }
                    }
                }
            }
        }
    }

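    // Recursively emits the given synset (layer SYNSET_SPECIFIC at distance 0) and all its
    // hypernyms (layer SYNSET_HYPERNYN, with a "len" attribute holding the distance), marking
    // each term with a "certain" attribute based on membership in the synsetsCertain set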
    private void expandSynsets(final String documentID, final String synset, final int len,
            final Set<String> synsetsCertain, final Set<String> synsetsSeen,
            final Collection<Term> terms) {
        if (synsetsSeen.add(synset)) {
            final boolean certain = synsetsCertain == null || synsetsCertain.contains(synset);
            if (len == 0) {
                terms.add(new Term(documentID, Layer.SYNSET_SPECIFIC, synset, "certain", certain));
            } else {
                terms.add(new Term(documentID, Layer.SYNSET_HYPERNYN, synset, "certain", certain,
                        "len", len));
            }
            for (final String hypernym : WordNet.getHypernyms(synset, false)) {
                expandSynsets(documentID, hypernym, len + 1, synsetsCertain, synsetsSeen, terms);
            }
        }
    }

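    // A token is indexable if it is 2 to 200 characters long, is not a Lucene stop word and
    // contains at least one letter or digit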
    private static boolean isValidTerm(final String wf) {
        if (wf.length() >= 2 && wf.length() <= 200
                && !LUCENE_STOP_WORDS.contains(wf.toLowerCase())) {
            for (int i = 0; i < wf.length(); ++i) {
                if (Character.isLetterOrDigit(wf.charAt(i))) {
                    return true;
                }
            }
        }
        return false;
    }

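    // Extracts the file/resource name without folder, query/fragment and extension, dropping
    // also a trailing compression extension; e.g., extractBasename("dir/doc.naf.gz") -> "doc"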
    private static String extractBasename(final String location) {
        Objects.requireNonNull(location);
        int extEnd = location.length() - (location.endsWith("/") ? 1 : 0);
        if (location.indexOf(':') >= 0) {
            int index = location.lastIndexOf('#');
            extEnd = index < 0 ? extEnd : index;
            index = location.lastIndexOf('?', extEnd);
            extEnd = index < 0 ? extEnd : index;
        }
        final int nameStart = location.lastIndexOf('/', extEnd - 1) + 1;
        int extStart = location.lastIndexOf('.', extEnd);
        final String ext = extStart < 0 ? "" : location.substring(extStart, extEnd);
        if (ext.equals(".gz") || ext.equals(".bz2") || ext.equals(".xz") || ext.equals(".7z")
                || ext.equals(".lz4")) {
            final int index = location.lastIndexOf('.', extStart - 1);
            extStart = index < 0 ? extStart : index;
        }
        return location.substring(nameStart, extStart < 0 ? extEnd : extStart); // no '.': keep all
    }
614 
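    // Extracts the extension, keeping a trailing compression extension attached;
    // e.g., extractExtension("dir/doc.naf.gz") -> ".naf.gz" (used above to detect NAF files)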
    private static String extractExtension(final String location) {
        Objects.requireNonNull(location);
        final int index = location.indexOf(':');
        int extEnd = location.length();
        if (index >= 0) {
            if (location.charAt(0) == '.') {
                return location.substring(0, index);
            }
            int index2 = location.lastIndexOf('#');
            extEnd = index2 < 0 ? extEnd : index2;
            index2 = location.lastIndexOf('?', extEnd);
            extEnd = index2 < 0 ? extEnd : index2;
        }
        int extStart = location.lastIndexOf('.', extEnd);
        String ext = extStart < 0 ? "" : location.substring(extStart, extEnd);
        if (ext.equals(".gz") || ext.equals(".bz2") || ext.equals(".xz") || ext.equals(".7z")
                || ext.equals(".lz4")) {
            extStart = location.lastIndexOf('.', extStart - 1);
            ext = extStart < 0 ? "" : location.substring(extStart, extEnd);
        }
        return ext;
    }

    private static final class SubWordExtractor {

        // Taken from WordDelimiterFilter

        private static final int LOWER = 0x01;

        private static final int UPPER = 0x02;

        private static final int DIGIT = 0x04;

        private static final int SUBWORD_DELIM = 0x08;

        private static final int ALPHA = LOWER | UPPER;

        private static final byte[] WORD_DELIM_TABLE;

        static {
            final byte[] tab = new byte[256];
            for (int i = 0; i < 256; i++) {
                byte code = 0;
                if (Character.isLowerCase(i)) {
                    code |= LOWER;
                } else if (Character.isUpperCase(i)) {
                    code |= UPPER;
                } else if (Character.isDigit(i)) {
                    code |= DIGIT;
                }
                if (code == 0) {
                    code = SUBWORD_DELIM;
                }
                tab[i] = code;
            }
            WORD_DELIM_TABLE = tab;
        }

        private static int charType(final int ch) {
            if (ch < WORD_DELIM_TABLE.length) {
                return WORD_DELIM_TABLE[ch];
            } else if (Character.isLowerCase(ch)) {
                return LOWER;
            } else if (Character.isLetter(ch)) {
                return UPPER;
            } else {
                return SUBWORD_DELIM;
            }
        }

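        // Splits a token at case and letter/digit transitions, also emitting concatenations of
        // adjacent same-class runs and the token itself;
        // e.g., extract("WiFi500") -> {"Wi", "Fi", "500", "WiFi", "WiFi500"}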
        static Set<String> extract(final String token) {
            final List<String> subTokens = Lists.newArrayList();
            final int len = token.length();
            if (len != 0) {
                int start = 0;
                int type = charType(token.charAt(start));
                while (start < len) {
                    while ((type & SUBWORD_DELIM) != 0 && ++start < len) {
                        type = charType(token.charAt(start));
                    }
                    int pos = start;
                    int lastType = type;
                    while (pos < len) {
                        if (type != lastType && ((lastType & UPPER) == 0 || (type & LOWER) == 0)) {
                            subTokens.add(token.substring(start, pos));
                            break;
                        }
                        if (++pos >= len) {
                            subTokens.add(token.substring(start, pos));
                            break;
                        }
                        lastType = type;
                        type = charType(token.charAt(pos));
                    }
                    start = pos;
                }
                final int numtok = subTokens.size();
                if (numtok > 1) {
                    subTokens.add(Joiner.on("").join(subTokens));
                    String tok = subTokens.get(0);
                    boolean isWord = (charType(tok.charAt(0)) & ALPHA) != 0;
                    boolean wasWord = isWord;
                    for (int i = 0; i < numtok;) {
                        int j;
                        for (j = i + 1; j < numtok; j++) {
                            wasWord = isWord;
                            tok = subTokens.get(j);
                            isWord = (charType(tok.charAt(0)) & ALPHA) != 0;
                            if (isWord != wasWord) {
                                break;
                            }
                        }
                        subTokens.add(Joiner.on("").join(subTokens.subList(i, j)));
                        i = j;
                    }
                }
            }
            subTokens.add(token);
            return ImmutableSet.copyOf(subTokens);
        }

    }

}