1   package eu.fbk.dkm.pikes.query;
3   import java.io.File;
4   import java.io.IOException;
5   import java.io.Reader;
6   import java.io.Writer;
7   import java.util.Collection;
8   import java.util.List;
9   import java.util.Map;
10  import java.util.Objects;
11  import java.util.Set;
12  import java.util.function.Consumer;
14  import javax.annotation.Nullable;
16  import com.google.common.base.Joiner;
17  import com.google.common.base.Strings;
18  import com.google.common.base.Throwables;
19  import com.google.common.collect.HashMultiset;
20  import com.google.common.collect.ImmutableList;
21  import com.google.common.collect.ImmutableMap;
22  import com.google.common.collect.ImmutableSet;
23  import com.google.common.collect.Iterables;
24  import com.google.common.collect.Lists;
25  import com.google.common.collect.Maps;
26  import com.google.common.collect.Multiset;
27  import com.google.common.collect.Ordering;
28  import com.google.common.collect.Sets;
29  import com.google.common.io.Files;
31  import eu.fbk.dkm.pikes.rdf.vocab.SUMO;
32  import net.didion.jwnl.data.PointerType;
34  import org.eclipse.rdf4j.model.Resource;
35  import org.eclipse.rdf4j.model.Statement;
36  import org.eclipse.rdf4j.model.IRI;
37  import org.eclipse.rdf4j.model.Value;
38  import org.eclipse.rdf4j.model.vocabulary.RDF;
39  import org.eclipse.rdf4j.model.vocabulary.RDFS;
40  import org.eclipse.rdf4j.model.vocabulary.SESAME;
41  import org.eclipse.rdf4j.model.vocabulary.SKOS;
42  import org.eclipse.rdf4j.rio.RDFHandlerException;
43  import org.eclipse.rdf4j.rio.Rio;
44  import org.slf4j.Logger;
45  import org.slf4j.LoggerFactory;
47  import ixa.kaflib.ExternalRef;
48  import ixa.kaflib.KAFDocument;
50  import eu.fbk.dkm.pikes.kv.KeyQuadIndex;
51  import eu.fbk.dkm.pikes.kv.KeyQuadSource;
52  import eu.fbk.dkm.pikes.query.Term.Layer;
53  import eu.fbk.dkm.pikes.resources.FrameBase;
54  import eu.fbk.dkm.pikes.resources.NAFUtils;
55  import eu.fbk.dkm.pikes.resources.Stemming;
56  import eu.fbk.dkm.pikes.resources.Sumo;
57  import eu.fbk.dkm.pikes.resources.WordNet;
58  import eu.fbk.dkm.pikes.resources.YagoTaxonomy;
59  import eu.fbk.utils.core.CommandLine;
60  import eu.fbk.dkm.pikes.rdf.vocab.KS_OLD;
61  import eu.fbk.rdfpro.AbstractRDFHandlerWrapper;
62  import eu.fbk.rdfpro.RDFHandlers;
63  import eu.fbk.rdfpro.RDFProcessors;
64  import eu.fbk.rdfpro.RDFSources;
65  import eu.fbk.rdfpro.util.IO;
66  import eu.fbk.rdfpro.util.QuadModel;
67  import eu.fbk.rdfpro.util.Statements;
69  public class TermExtractor {
71      private static final Logger LOGGER = LoggerFactory.getLogger(TermExtractor.class);
73      private static final Set<String> LUCENE_STOP_WORDS = ImmutableSet.of("a", "an", "and", "are",
74              "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not",
75              "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they",
76              "this", "to", "was", "will", "with", "'s"); // added 's
78      private static final String NS_DBPEDIA = "http://dbpedia.org/resource/";
80      private static final Map<String, Layer> TYPE_MAP = ImmutableMap.of(YagoTaxonomy.NAMESPACE,
81              Layer.TYPE_YAGO, SUMO.NAMESPACE, Layer.TYPE_SUMO, FrameBase.NAMESPACE,
82              Layer.PREDICATE_FRB, "http://www.newsreader-project.eu/ontologies/propbank/",
83              Layer.PREDICATE_PB, "http://www.newsreader-project.eu/ontologies/nombank/",
84              Layer.PREDICATE_NB);
86      private static final Map<String, Layer> PROPERTY_MAP = ImmutableMap.of(FrameBase.NAMESPACE,
87              Layer.ROLE_FRB, "http://www.newsreader-project.eu/ontologies/propbank/",
88              Layer.ROLE_PB, "http://www.newsreader-project.eu/ontologies/nombank/", Layer.ROLE_NB);
90      private static final Set<String> RECURSIVE_ENRICHMENT_NAMESPACES = ImmutableSet.of(
91              YagoTaxonomy.NAMESPACE, FrameBase.NAMESPACE, SUMO.NAMESPACE);
93      private static final Map<String, String> CONCEPT_MAP = ImmutableMap.of(YagoTaxonomy.NAMESPACE,
94              "dbyago", FrameBase.NAMESPACE, "frb", NS_DBPEDIA, "dbpedia", "entity:", "entity");
96      private final KeyQuadSource enrichmentIndex;
98      public static void main(final String[] args) {
99          try {
100             // Parse command line
101             final CommandLine cmd = CommandLine
102                     .parser()
103                     .withName("pikes-tex")
104                     .withOption("i", "index", "use index at PATH for IRI enrichment", "PATH",
105                             CommandLine.Type.FILE, true, false, false)
106                     .withOption("r", "recursive", "whether to recurse into input directories")
107                     .withOption("o", "output", "output base name", "PATH",
108                             CommandLine.Type.STRING, true, false, true)
109                     .withHeader("parses the Yovisto file and emits NAF files for each document")
110                     .parse(args);
112             // Extract options
113             final boolean recursive = cmd.hasOption("r");
114             final File index = cmd.getOptionValue("i", File.class, null);
115             final File output = cmd.getOptionValue("o", File.class);
116             final List<File> files = cmd.getArgs(File.class);
118             // Initialize enrichment index, if enabled
119             KeyQuadIndex enrichmentIndex = null;
120             if (index != null) {
121                 enrichmentIndex = new KeyQuadIndex(index);
122                 LOGGER.info("Loaded enrichment index at {}", index);
123             }
125             // Perform the extraction and write the results
126             final TermExtractor extractor = new TermExtractor(enrichmentIndex);
127             try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(output.getAbsolutePath())))) {
128                 extractor.extract(
129                         files,
130                         recursive,
131                         (final List<Term> terms) -> {
132                             try {
133                                 final Multiset<Term> termSet = HashMultiset.create(terms);
134                                 for (final Term term : Ordering.natural().sortedCopy(
135                                         termSet.elementSet())) {
136                                     writer.append(term.getDocument());
137                                     writer.append("\t");
138                                     writer.append(term.getLayer().getID());
139                                     writer.append("\t");
140                                     writer.append(term.getToken());
141                                     writer.append("\t");
142                                     writer.append(Integer.toString(termSet.count(term)));
143                                     if (!term.getAttributes().isEmpty()) {
144                                         for (final String key : Ordering.natural().sortedCopy(
145                                                 term.getAttributes().keySet())) {
146                                             writer.append("\t");
147                                             writer.append(key);
148                                             writer.append("=");
149                                             writer.append(term.getAttributes().get(key));
150                                         }
151                                     }
152                                     writer.write("\n");
153                                 }
154                             } catch (final IOException ex) {
155                                 Throwables.propagate(ex);
156                             }
157                         });
158             }
160             // Release enrichment index, if used
161             if (enrichmentIndex != null) {
162                 enrichmentIndex.close();
163             }
165         } catch (final Throwable ex) {
166             // Display error information and terminate
167             CommandLine.fail(ex);
168         }
169     }
    /**
     * Creates a term extractor.
     *
     * @param enrichmentIndex
     *            the source queried to enrich model IRIs with additional triples before
     *            extraction, or {@code null} to skip the enrichment step
     */
    public TermExtractor(@Nullable final KeyQuadSource enrichmentIndex) {
        this.enrichmentIndex = enrichmentIndex;
    }
175     public void extract(final Iterable<File> files, final boolean recursive,
176             final Consumer<List<Term>> sink) throws IOException {
178         // Expand file list if recursive
179         final List<File> allFiles = Lists.newArrayList(files);
180         if (recursive) {
181             for (final File file : files) {
182                 if (file.isDirectory()) {
183                     Iterables.addAll(allFiles, Files.fileTreeTraverser().preOrderTraversal(file));
184                 }
185             }
186         }
188         // Index NAF files and RDF files by name (without extension and folder
189         final Map<String, File> annotationFiles = Maps.newHashMap();
190         final Map<String, File> modelFiles = Maps.newHashMap();
191         for (final File file : allFiles) {
192             if (file.isFile()) {
193                 if (Rio.getParserFormatForFileName(file.getName()) != null) {
194                     modelFiles.put(extractBasename(file.getName()), file);
195                 } else if (extractExtension(file.getName()).startsWith(".naf")) {
196                     annotationFiles.put(extractBasename(file.getName()), file);
197                 }
198             }
199         }
201         // Log before processing
202         final long ts = System.currentTimeMillis();
203         LOGGER.info("Processing {} annotation files, {} RDF files", annotationFiles.size(),
204                 modelFiles.size());
206         // Process each annotation / RDF file pair, aggregating the results
207         int pairs = 0;
208         for (final String basename : Ordering.natural().sortedCopy(annotationFiles.keySet())) {
209             final File annotationFile = annotationFiles.get(basename);
210             final File modelFile = modelFiles.get(basename);
211             if (annotationFile != null && modelFile != null) {
212                 final List<Term> result = extract(annotationFile, modelFile);
213                 sink.accept(result);
214                 ++pairs;
215             }
216         }
218         // Log after processing
219         LOGGER.info("Processing of {} file pairs completed in {} ms", pairs,
220                 System.currentTimeMillis() - ts);
221     }
223     public List<Term> extract(final Iterable<File> files, final boolean recursive)
224             throws IOException {
226         final List<Term> result = Lists.newArrayList();
227         extract(files, recursive, (final List<Term> t) -> {
228             result.addAll(t);
229         });
230         return result;
231     }
233     public List<Term> extract(final File annotationFile, final File modelFile) throws IOException {
235         // Read annotation file
236         final KAFDocument annotation;
237         try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(annotationFile.getAbsolutePath())))) {
238             annotation = KAFDocument.createFromStream(reader);
239         }
241         // Read RDF file
242         final QuadModel model = QuadModel.create();
243         try {
244             RDFSources.read(false, true, null, null, null, true, modelFile.getAbsolutePath()).emit(
245                     new AbstractRDFHandlerWrapper(RDFHandlers.wrap(model)) {
247                         @Override
248                         public void handleStatement(final Statement stmt)
249                                 throws RDFHandlerException {
250                             super.handleStatement(Statements.VALUE_FACTORY.createStatement(
251                                     stmt.getSubject(), stmt.getPredicate(), stmt.getObject()));
252                         }
254                     }, 1);
255         } catch (final RDFHandlerException ex) {
256             throw new IOException(ex);
257         }
259         // Delegate
260         return extract(annotation, model);
261     }
263     public List<Term> extract(final KAFDocument document, final Iterable<Statement> model) {
265         // Obtain document ID from NAF document
266         String documentID = document.getPublic().publicId;
267         if (Strings.isNullOrEmpty(documentID)) {
268             documentID = extractBasename(document.getPublic().uri);
269         }
271         // Obtain a quad model over RDF statements
272         final QuadModel quadModel = model instanceof QuadModel ? (QuadModel) model //
273                 : QuadModel.create(model);
275         try {
276             // Recursively enrich model IRIs if an enrichment index is available
277             if (this.enrichmentIndex != null) {
278                 final Set<IRI> uris = Sets.newHashSet();
279                 for (final Statement stmt : quadModel) {
280                     for (final Value value : new Value[] { stmt.getSubject(), stmt.getPredicate(),
281                             stmt.getObject(), stmt.getContext() }) {
282                         if (value instanceof IRI) {
283                             uris.add((IRI) value);
284                         }
285                     }
286                 }
287                 final int numTriplesBefore = quadModel.size();
288                 this.enrichmentIndex.getRecursive(uris, (final Value v) -> {
289                     return v instanceof IRI && //
290                             RECURSIVE_ENRICHMENT_NAMESPACES.contains(((IRI) v).getNamespace());
291                 }, RDFHandlers.wrap(quadModel));
292                 LOGGER.debug("Enriched {} IRIs with {} triples", uris.size(), quadModel.size()
293                         - numTriplesBefore);
294             }
296             // Perform inference
297             final int numTriplesBefore = quadModel.size();
298             RDFProcessors.rdfs(RDFSources.wrap(ImmutableList.copyOf(quadModel)), SESAME.NIL, true,
299                     true, "rdfs4a", "rdfs4b", "rdfs8").apply(RDFSources.NIL,
300                     RDFHandlers.wrap(quadModel), 1);
301             LOGGER.debug("Inferred {} triples (total {})", quadModel.size() - numTriplesBefore,
302                     quadModel.size());
304         } catch (final RDFHandlerException ex) {
305             // Wrap and propagate
306             Throwables.propagate(ex);
307         }
309         // Process NAF and model
310         final List<Term> terms = Lists.newArrayList();
311         extract(documentID, document, terms);
312         extract(documentID, quadModel, terms);
313         return terms;
314     }
316     private void extract(final String documentID, final QuadModel model,
317             final Collection<Term> terms) {
319         // Emit terms for IRIs
320         final List<IRI> entities = Lists.newArrayList();
321         final Set<IRI> knownEntities = Sets.newHashSet();
322         for (final Resource entity : model.filter(null, RDF.TYPE, KS_OLD.ENTITY).subjects()) {
323             if (entity instanceof IRI) {
324                 final IRI uri = (IRI) entity;
325                 entities.add(uri);
326                 if (uri.getNamespace().equals(NS_DBPEDIA)) {
327                     terms.add(new Term(documentID, Layer.URI_DBPEDIA, uri.getLocalName()));
328                     knownEntities.add(uri);
329                 }
330                 // TODO: entity:XXX treated as anonymous instances
331                 //                else if (model.contains(uri, FOAF.NAME, null)) {
332                 //                    terms.add(new Term(documentID, Layer.IRI_CUSTOM, uri.getLocalName()));
333                 //                    knownEntities.add(uri);
334                 //                }
335             }
336         }
338         // Emit related entities TODO
339         //        for (final IRI entity : entities) {
340         //            for (final Value related : model.filter(entity, SKOS.RELATED, null).objects()) {
341         //                if (related instanceof IRI) {
342         //                    final IRI uri = (IRI) related;
343         //                    if (uri.getNamespace().equals(NS_DBPEDIA)) {
344         //                        terms.add(new Term(documentID, Layer.IRI_RELATED, uri.getLocalName()));
345         //                    }
346         //                }
347         //            }
348         //        }
350         // Emit types / predicates
351         for (final IRI entity : entities) {
352             final Set<IRI> types = Sets.newHashSet();
353             for (final Value type : model.filter(entity, RDF.TYPE, null).objects()) {
354                 if (type instanceof IRI) {
355                     types.add((IRI) type);
356                 }
357             }
358             final Set<IRI> parents = Sets.newHashSet();
359             for (final IRI type : types) {
360                 if (!FrameBase.isMicroframe(type)) {
361                     for (final Value parentType : model.filter(type, RDFS.SUBCLASSOF, null)
362                             .objects()) {
363                         if (parentType instanceof IRI && !parentType.equals(type)) {
364                             parents.add((IRI) parentType);
365                         }
366                     }
367                 }
368             }
369             final Set<IRI> directTypes = Sets.difference(types, parents);
370             for (final IRI type : types) {
371                 final Layer typeLayer = TYPE_MAP.get(type.getNamespace());
372                 if (typeLayer != null) {
373                     if (directTypes.contains(type)) {
374                         terms.add(new Term(documentID, typeLayer, type.getLocalName()));
375                     } else {
376                         terms.add(new Term(documentID, typeLayer, type.getLocalName(),
377                                 "inherited", true));
378                     }
379                 }
380             }
381         }
383         // Emit roles
384         for (final IRI entity : entities) {
385             final Set<Statement> stmts = Sets.newHashSet(model.filter(entity, null, null));
386             final Set<Statement> parentStmts = Sets.newHashSet();
387             for (final Statement stmt : stmts) {
388                 final IRI pred = stmt.getPredicate();
389                 final Value obj = stmt.getObject();
390                 for (final Value parentPred : model.filter(pred, RDFS.SUBPROPERTYOF, null)
391                         .objects()) {
392                     if (parentPred instanceof IRI && !parentPred.equals(pred)) {
393                         parentStmts.add(Statements.VALUE_FACTORY.createStatement(entity, (IRI) parentPred, obj));
394                     }
395                 }
396             }
397             final Set<Statement> directStmts = Sets.difference(stmts, parentStmts);
398             for (final Statement stmt : stmts) {
399                 final IRI uri = stmt.getPredicate();
400                 final Layer propertyLayer = PROPERTY_MAP.get(uri.getNamespace());
401                 if (propertyLayer != null) {
402                     if (directStmts.contains(stmt)) {
403                         terms.add(new Term(documentID, propertyLayer, uri.getLocalName()));
404                     } else {
405                         terms.add(new Term(documentID, propertyLayer, uri.getLocalName(),
406                                 "inherited", true));
407                     }
408                 }
409             }
410         }
412         // Emit concepts
413         for (final IRI entity : entities) {
414             final Set<IRI> concepts = Sets.newHashSet();
415             final Set<IRI> directConcepts = Sets.newHashSet();
416             final List<IRI> queue = Lists.newLinkedList();
418             for (final Value type : model.filter(entity, RDF.TYPE, null).objects()) {
419                 if (type instanceof IRI && CONCEPT_MAP.containsKey(((IRI) type).getNamespace())) {
420                     directConcepts.add((IRI) type);
421                 }
422             }
423             for (final IRI type : ImmutableList.copyOf(directConcepts)) {
424                 if (!FrameBase.isMicroframe(type)) {
425                     final Set<Value> parents = Sets.newHashSet(model.filter(type, RDFS.SUBCLASSOF,
426                             null).objects());
427                     parents.remove(type);
428                     directConcepts.removeAll(parents);
429                 }
430             }
432             if (knownEntities.contains(entity)) {
433                 directConcepts.add(entity);
434             }
436             concepts.addAll(directConcepts);
437             queue.addAll(directConcepts);
438             while (!queue.isEmpty()) {
439                 final IRI uri = queue.remove(0);
440                 for (final Value parent : model.filter(uri, SKOS.BROADER, null).objects()) {
441                     if (parent instanceof IRI) {
442                         final IRI parentIRI = (IRI) parent;
443                         if (CONCEPT_MAP.containsKey(parentIRI.getNamespace())
444                                 && !concepts.contains(parentIRI)) {
445                             concepts.add(parentIRI);
446                             queue.add(parentIRI);
447                         }
448                     }
449                 }
450             }
451             for (final IRI concept : concepts) {
452                 final String prefix = CONCEPT_MAP.get(concept.getNamespace());
453                 final String name = prefix + ":" + concept.getLocalName();
454                 if (directConcepts.contains(concept)) {
455                     terms.add(new Term(documentID, Layer.CONCEPT, name));
456                 } else {
457                     terms.add(new Term(documentID, Layer.CONCEPT, name, "inherited", true));
458                 }
459             }
460         }
461     }
463     private void extract(final String documentID, final KAFDocument document,
464             final Collection<Term> terms) {
466         // Emit raw text term
467         terms.add(new Term(documentID, Layer.RAW, document.getRawText().replace('\n', ' ')
468                 .replace('\r', ' ').replace('\t', ' ')));
470         // Iterate over all the tokens in the document
471         for (final ixa.kaflib.Term term : document.getTerms()) {
473             // Extract lower case token
474             final String wf = term.getStr().trim();
476             // Apply stop word filter
477             if (!isValidTerm(wf)) {
478                 continue;
479             }
481             // Emit stem term
482             final String stem = Stemming.stem("en", wf.toLowerCase());
483             terms.add(new Term(documentID, Layer.STEM_TEXT, stem));
485             // Emit lemma term
486             final String lemma = term.getLemma().toLowerCase();
487             terms.add(new Term(documentID, Layer.LEMMA_TEXT, lemma));
489             // Emit subwords terms
490             for (final String subWord : SubWordExtractor.extract(wf)) {
491                 if (isValidTerm(subWord)) {
492                     final String subWordStem = Stemming.stem("en", subWord.toLowerCase());
493                     terms.add(new Term(documentID, Layer.STEM_SUBWORD, subWordStem));
494                 }
495             }
497             // Extract WordNet POS (n, v, a, r)
498             final String pos = term.getMorphofeat();
499             final String wnPos;
500             if (pos.startsWith("NN")) {
501                 wnPos = WordNet.POS_NOUN;
502             } else if (pos.startsWith("VB")) {
503                 wnPos = WordNet.POS_VERB;
504             } else if (pos.startsWith("JJ")) {
505                 wnPos = WordNet.POS_ADJECTIVE;
506             } else if (pos.startsWith("RB") || pos.equals("WRB")) {
507                 wnPos = WordNet.POS_ADVERB;
508             } else {
509                 wnPos = null;
510             }
512             // Emit synset terms
513             if (wnPos != null) {
514                 final List<String> synsets = WordNet.getSynsetsForLemma(lemma, wnPos);
515                 if (!synsets.isEmpty()) {
516                     Set<String> synsetsCertain = null;
517                     for (final String synset : synsets) {
518                         if (synsetsCertain == null) {
519                             synsetsCertain = WordNet.getHypernyms(synset, true);
520                         } else {
521                             synsetsCertain.retainAll(WordNet.getHypernyms(synset, true));
522                         }
523                     }
524                     String synset = null;
525                     if (synsets.size() == 1) {
526                         synset = synsets.get(0);
527                     } else {
528                         final ExternalRef synsetRef = NAFUtils.getRef(term, "wn30-ukb", null);
529                         if (synsetRef != null) {
530                             synset = synsetRef.getReference();
531                         }
532                     }
533                     if (synset != null) {
534                         expandSynsets(documentID, synset, 0, synsetsCertain, Sets.newHashSet(),
535                                 terms);
536                         if (synsetsCertain.contains(synset)) {
537                             for (final String synonym : WordNet.getLemmas(synset)) {
538                                 terms.add(new Term(documentID, Layer.LEMMA_SYNONYM, synonym));
539                                 terms.add(new Term(documentID, Layer.STEM_SYNONYM, Stemming.stem(
540                                         "en", synonym)));
541                             }
542                             final Set<String> relatedSynsets = Sets.newHashSet();
543                             for (final PointerType pt : new PointerType[] { PointerType.DERIVED,
544                                     PointerType.PERTAINYM, PointerType.NOMINALIZATION,
545                                     PointerType.PARTICIPLE_OF }) {
546                                 relatedSynsets.addAll(WordNet.getGenericSet(synset, pt));
547                             }
548                             final Set<String> relatedLemmas = Sets.newHashSet();
549                             for (final String relatedSynset : relatedSynsets) {
550                                 relatedLemmas.addAll(WordNet.getLemmas(relatedSynset));
551                                 terms.add(new Term(documentID, Layer.SYNSET_RELATED,
552                                         relatedSynset, "certain", true));
553                             }
554                             for (final String relatedLemma : relatedLemmas) {
555                                 terms.add(new Term(documentID, Layer.LEMMA_RELATED, relatedLemma));
556                                 terms.add(new Term(documentID, Layer.STEM_RELATED, Stemming.stem(
557                                         "en", relatedLemma)));
558                             }
559                         }
560                     }
561                 }
562             }
563         }
564     }
566     private void expandSynsets(final String documentID, final String synset, final int len,
567             final Set<String> synsetsCertain, final Set<String> synsetsSeen,
568             final Collection<Term> terms) {
569         if (synsetsSeen.add(synset)) {
570             final boolean certain = synsetsCertain == null || synsetsCertain.contains(synset);
571             if (len == 0) {
572                 terms.add(new Term(documentID, Layer.SYNSET_SPECIFIC, synset, "certain", certain));
573             } else {
574                 terms.add(new Term(documentID, Layer.SYNSET_HYPERNYN, synset, "certain", certain,
575                         "len", len));
576             }
577             for (final String hypernym : WordNet.getHypernyms(synset, false)) {
578                 expandSynsets(documentID, hypernym, len + 1, synsetsCertain, synsetsSeen, terms);
579             }
580         }
581     }
583     private static boolean isValidTerm(final String wf) {
584         if (wf.length() >= 2 && wf.length() <= 200
585                 && !LUCENE_STOP_WORDS.contains(wf.toLowerCase())) {
586             for (int i = 0; i < wf.length(); ++i) {
587                 if (Character.isLetterOrDigit(wf.charAt(i))) {
588                     return true;
589                 }
590             }
591         }
592         return false;
593     }
595     private static String extractBasename(final String location) {
596         Objects.requireNonNull(location);
597         int extEnd = location.length() - (location.endsWith("/") ? 1 : 0);
598         if (location.indexOf(':') >= 0) {
599             int index = location.lastIndexOf('#');
600             extEnd = index < 0 ? extEnd : index;
601             index = location.lastIndexOf('?', extEnd);
602             extEnd = index < 0 ? extEnd : index;
603         }
604         final int nameStart = Math.max(-1, location.lastIndexOf('/', extEnd - 1)) + 1;
605         int extStart = location.lastIndexOf('.', extEnd);
606         final String ext = extStart < 0 ? "" : location.substring(extStart, extEnd);
607         if (ext.equals(".gz") || ext.equals(".bz2") || ext.equals(".xz") || ext.equals(".7z")
608                 || ext.equals(".lz4")) {
609             final int index = location.lastIndexOf('.', extStart - 1);
610             extStart = index < 0 ? extStart : index;
611         }
612         return location.substring(nameStart, extStart);
613     }
615     private static String extractExtension(final String location) {
616         Objects.requireNonNull(location);
617         final int index = location.indexOf(':');
618         int extEnd = location.length();
619         if (index >= 0) {
620             if (location.charAt(0) == '.') {
621                 return location.substring(0, index);
622             }
623             int index2 = location.lastIndexOf('#');
624             extEnd = index2 < 0 ? extEnd : index2;
625             index2 = location.lastIndexOf('?', extEnd);
626             extEnd = index2 < 0 ? extEnd : index2;
627         }
628         int extStart = location.lastIndexOf('.', extEnd);
629         String ext = extStart < 0 ? "" : location.substring(extStart, extEnd);
630         if (ext.equals(".gz") || ext.equals(".bz2") || ext.equals(".xz") || ext.equals(".7z")
631                 || ext.equals(".lz4")) {
632             extStart = location.lastIndexOf('.', extStart - 1);
633             ext = extStart < 0 ? "" : location.substring(extStart, extEnd);
634         }
635         return ext;
636     }
638     private static final class SubWordExtractor {
640         // Taken from WordDelimiterFilter
642         private static final int LOWER = 0x01;
644         private static final int UPPER = 0x02;
646         private static final int DIGIT = 0x04;
648         private static final int SUBWORD_DELIM = 0x08;
650         private static final int ALPHA = LOWER | UPPER;
652         private static final byte[] WORD_DELIM_TABLE;
654         static {
655             final byte[] tab = new byte[256];
656             for (int i = 0; i < 256; i++) {
657                 byte code = 0;
658                 if (Character.isLowerCase(i)) {
659                     code |= LOWER;
660                 } else if (Character.isUpperCase(i)) {
661                     code |= UPPER;
662                 } else if (Character.isDigit(i)) {
663                     code |= DIGIT;
664                 }
665                 if (code == 0) {
666                     code = SUBWORD_DELIM;
667                 }
668                 tab[i] = code;
669             }
670             WORD_DELIM_TABLE = tab;
671         }
673         private static int charType(final int ch) {
674             if (ch < WORD_DELIM_TABLE.length) {
675                 return WORD_DELIM_TABLE[ch];
676             } else if (Character.isLowerCase(ch)) {
677                 return LOWER;
678             } else if (Character.isLetter(ch)) {
679                 return UPPER;
680             } else {
681                 return SUBWORD_DELIM;
682             }
683         }
685         static Set<String> extract(final String token) {
686             final List<String> subTokens = Lists.newArrayList();
687             final int len = token.length();
688             if (len != 0) {
689                 int start = 0;
690                 int type = charType(token.charAt(start));
691                 while (start < len) {
692                     while ((type & SUBWORD_DELIM) != 0 && ++start < len) {
693                         type = charType(token.charAt(start));
694                     }
695                     int pos = start;
696                     int lastType = type;
697                     while (pos < len) {
698                         if (type != lastType && ((lastType & UPPER) == 0 || (type & LOWER) == 0)) {
699                             subTokens.add(token.substring(start, pos));
700                             break;
701                         }
702                         if (++pos >= len) {
703                             subTokens.add(token.substring(start, pos));
704                             break;
705                         }
706                         lastType = type;
707                         type = charType(token.charAt(pos));
708                     }
709                     start = pos;
710                 }
711                 final int numtok = subTokens.size();
712                 if (numtok > 1) {
713                     subTokens.add(Joiner.on("").join(subTokens));
714                     String tok = subTokens.get(0);
715                     boolean isWord = (charType(tok.charAt(0)) & ALPHA) != 0;
716                     boolean wasWord = isWord;
717                     for (int i = 0; i < numtok;) {
718                         int j;
719                         for (j = i + 1; j < numtok; j++) {
720                             wasWord = isWord;
721                             tok = subTokens.get(j);
722                             isWord = (charType(tok.charAt(0)) & ALPHA) != 0;
723                             if (isWord != wasWord) {
724                                 break;
725                             }
726                         }
727                         subTokens.add(Joiner.on("").join(subTokens.subList(i, j)));
728                         i = j;
729                     }
730                 }
731             }
732             subTokens.add(token);
733             return ImmutableSet.copyOf(subTokens);
734         }
736     }
738 }