1 package eu.fbk.dkm.pikes.eval;
2
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;

import javax.annotation.Nullable;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;

import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.vocabulary.DCTERMS;
import org.eclipse.rdf4j.model.vocabulary.OWL;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDFS;
import org.eclipse.rdf4j.query.BindingSet;
import org.eclipse.rdf4j.query.algebra.TupleExpr;
import org.eclipse.rdf4j.rio.RDFHandler;
import org.slf4j.LoggerFactory;

import eu.fbk.dkm.pikes.rdf.vocab.NIF;
import eu.fbk.rdfpro.RDFHandlers;
import eu.fbk.rdfpro.RDFSources;
import eu.fbk.rdfpro.util.QuadModel;
import eu.fbk.rdfpro.util.Statements;
import eu.fbk.utils.core.CommandLine;
import eu.fbk.utils.core.CommandLine.Type;
46
47 public class Converter {
48
49 private static final Set<String> AM_ROLES = ImmutableSet.of("dir", "loc", "mnr", "ext", "rec",
50 "prd", "pnc", "cau", "dis", "adv", "mod", "neg");
51
52 private static final IRI DUL_ASSOCIATED_WITH = Statements.VALUE_FACTORY
53 .createIRI("http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#associatedWith");
54
55 public static final Converter FRED_CONVERTER = new Converter(
56 "fred",
57 ""
58 + "SELECT ?uri (REPLACE(?t, '_', ' ') AS ?text)\n"
59 + "WHERE { ?uri a nif:Context ; nif:isString ?t . }\n",
60 ""
61 + "PREFIX fsem: <http://ontologydesignpatterns.org/cp/owl/semiotics.owl#>\n"
62 + "PREFIX eval: <http://pikes.fbk.eu/ontologies/eval#>\n"
63 + "SELECT ?node ?begin ?end ?head\n"
64 + "WHERE {\n"
65 + " ?m fsem:denotes|fsem:hasInterpretant ?node ;\n"
66 + " nif:beginIndex ?begin ;\n"
67 + " nif:endIndex ?end ;\n"
68 + " OPTIONAL { ?m eval:head ?head }\n"
69 + " FILTER EXISTS { ?node ?p ?o }\n"
70 + " FILTER NOT EXISTS { ?s ?node ?o }\n"
71 + " FILTER NOT EXISTS { ?s a ?node }\n"
72 + " FILTER NOT EXISTS { ?node a owl:Class }\n"
73 + "}\n"
74 + "ORDER BY ?m",
75 (final IRI uri) -> {
76 String ns = uri.getNamespace();
77 String name = uri.getLocalName();
78 if (ns.equals("http://www.ontologydesignpatterns.org/ont/vn/abox/role/")
79 || ns.equals("http://www.ontologydesignpatterns.org/ont/boxer/boxer.owl#")
80 && (name.equals("agent") || name.equals("patient") || name.equals("theme"))) {
81 ns = "http://pikes.fbk.eu/ontologies/verbnet#";
82 name = name.toLowerCase();
83 } else if (ns.equals("http://www.ontologydesignpatterns.org/ont/vn/data/")) {
84 ns = "http://pikes.fbk.eu/ontologies/verbnet#";
85 final String code = name.substring(name.lastIndexOf('_') + 1);
86 final int l = code.length();
87 final int n1 = l < 2 ? 0 : Integer.parseInt(code.substring(0, 2));
88 final int n2 = l < 4 ? 0 : Integer.parseInt(code.substring(2, 4));
89 final int n3 = l < 5 ? 0 : Character.digit(code.charAt(4), 10);
90 final int n4 = l < 6 ? 0 : Character.digit(code.charAt(5), 10);
91 final int n5 = l < 7 ? 0 : Character.digit(code.charAt(6), 10);
92 final int n6 = l < 8 ? 0 : Character.digit(code.charAt(7), 10);
93 final StringBuilder b = new StringBuilder().append(n1);
94 assert n1 >= 0 && n2 >= 0 && n3 >= 0 && n4 >= 0 && n5 >= 0 && n6 >= 0;
95 if (n2 != 0) {
96 b.append('.').append(n2);
97 if (n3 != 0) {
98 b.append('.').append(n3);
99 }
100 }
101 if (n4 != 0) {
102 b.append('-').append(n4);
103 if (n5 != 0) {
104 b.append('-').append(n5);
105 if (n6 != 0) {
106 b.append('-').append(n6);
107 }
108 }
109 }
110 name = b.toString();
111 }
112 return Statements.VALUE_FACTORY.createIRI(ns, name);
113 }, "PREFIX fsem: <http://ontologydesignpatterns.org/cp/owl/semiotics.owl#>\n"
114 + "SELECT ?s (owl:sameAs AS ?p) ?o\n "
115 + "WHERE { ?s fsem:denotes ?o. FILTER EXISTS { ?m fsem:denotes ?s } }");
116
117 public static final Converter GOLD_CONVERTER = new Converter("gold", ""
118 + "SELECT ?uri ?text\n"
119 + "WHERE { ?uri rdfs:label ?text . }\n", ""
120 + "PREFIX fsem: <http://ontologydesignpatterns.org/cp/owl/semiotics.owl#>\n"
121 + "PREFIX eval: <http://pikes.fbk.eu/ontologies/eval#>\n"
122 + "SELECT DISTINCT ?node (?node AS ?head)\n"
123 + "WHERE {\n"
124 + " { ?node a eval:Node } UNION\n"
125 + " { ?node a eval:Entity } UNION\n"
126 + " { ?node a eval:Frame } UNION\n"
127 + " { ?node a eval:Quality }\n"
128 + "}\n"
129 + "ORDER BY ?m",
130 (final IRI uri) -> {
131 final String ns = uri.getNamespace();
132 String name = uri.getLocalName();
133 if (ns.equals("http://pikes.fbk.eu/ontologies/verbnet#")) {
134 final int index = name.indexOf('-');
135 if (index > 0) {
136 name = name.substring(index + 1);
137 }
138 }
139 return Statements.VALUE_FACTORY.createIRI(ns, name);
140 });
141
142 public static final Converter PIKES_CONVERTER = new Converter("pikes", ""
143 + "PREFIX eval: <http://pikes.fbk.eu/ontologies/eval#>\n" //
144 + "SELECT ?uri ?text\n"
145 + "WHERE { ?uri a eval:Sentence ; rdfs:label ?text . }\n", ""
146 + "PREFIX gaf: <http://groundedannotationframework.org/gaf#>\n"
147 + "PREFIX eval: <http://pikes.fbk.eu/ontologies/eval#>\n"
148 + "SELECT ?node ?begin ?end ?head (?m AS ?sentence)\n"
149 + "WHERE {\n"
150 + " ?node gaf:denotedBy ?m .\n"
151 + " ?m nif:beginIndex ?begin ;\n"
152 + " nif:endIndex ?end ;\n"
153 + " OPTIONAL { ?m eval:head ?head }\n" + "}\n"
154 + "ORDER BY ?m",
155 (final IRI uri) -> {
156 String ns = uri.getNamespace();
157 String name = uri.getLocalName();
158 boolean rewriteName = false;
159 if (ns.equals("http://www.newsreader-project.eu/ontologies/propbank/")) {
160 ns = "http://pikes.fbk.eu/ontologies/propbank#";
161 rewriteName = true;
162 } else if (ns.equals("http://www.newsreader-project.eu/ontologies/nombank/")) {
163 ns = "http://pikes.fbk.eu/ontologies/nombank#";
164 rewriteName = true;
165 } else if (ns.equals("http://www.newsreader-project.eu/ontologies/verbnet/")) {
166 ns = "http://pikes.fbk.eu/ontologies/verbnet#";
167 final int index = name.indexOf('-');
168 if (index > 0) {
169 name = name.substring(index + 1);
170 }
171 } else if (ns.equals("http://www.newsreader-project.eu/ontologies/framenet/")) {
172 ns = "http://pikes.fbk.eu/ontologies/framenet#";
173 } else if (ns.equals("http://dkm.fbk.eu/ontologies/knowledgestore#")
174 && name.equals("mod")) {
175 ns = "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#";
176 name = "associatedWith";
177 }
178 if (rewriteName) {
179 if (AM_ROLES.contains(name.toLowerCase())) {
180 name = "am-" + name.toLowerCase();
181 } else if (name.endsWith("_0") || name.endsWith("_1") || name.endsWith("_2")
182 || name.endsWith("_3") || name.endsWith("_4") || name.endsWith("_5")) {
183 name = "a" + name.charAt(name.length() - 1);
184 }
185 }
186 return Statements.VALUE_FACTORY.createIRI(ns, name);
187 });
188
189 private static final Set<IRI> IGNORABLE_TERMS = ImmutableSet.of(
190 Statements.VALUE_FACTORY.createIRI("http://www.newsreader-project.eu/ontologies/propbank/adv"), //
191 Statements.VALUE_FACTORY.createIRI("http://www.newsreader-project.eu/ontologies/nombank/adv"), //
192 Statements.VALUE_FACTORY.createIRI("http://groundedannotationframework.org/gaf#denotedBy"), //
193 Statements.VALUE_FACTORY.createIRI("http://www.ontologydesignpatterns.org/ont/fred/pos.owl#boxerpos"), //
194 Statements.VALUE_FACTORY.createIRI("http://ontologydesignpatterns.org/cp/owl/semiotics.owl#denotes"), //
195 Statements.VALUE_FACTORY.createIRI("http://ontologydesignpatterns.org/cp/owl/semiotics.owl#hasInterpretant"), //
196 NIF.OFFSET_BASED_STRING, NIF.BEGIN_INDEX, NIF.END_INDEX, NIF.REFERENCE_CONTEXT);
197
/** Name of the tool that produced the data; used as graph suffix and dcterms:creator. */
private final String creator;

/** Parsed query returning the {@code ?uri} / {@code ?text} pairs of the sentences. */
private final TupleExpr textQuery;

/** Parsed query returning the nodes together with their head or begin/end offsets. */
private final TupleExpr nodeQuery;

/** Optional rewriter applied to predicates and {@code rdf:type} objects; may be null. */
private final Function<IRI, IRI> uriRewriter;

/** Parsed queries returning extra {@code ?s ?p ?o} statements to convert. */
private final TupleExpr[] expandQueries;

/**
 * Creates a converter, parsing all the supplied query strings via {@code Util.parse}.
 *
 * @param creator
 *            the creator name, not null
 * @param textQuery
 *            the query string selecting sentence URIs and texts
 * @param nodeQuery
 *            the query string selecting the nodes
 * @param uriRewriter
 *            the optional URI rewriter, possibly null
 * @param expandQueries
 *            zero or more query strings producing additional statements
 */
public Converter(final String creator, final String textQuery, final String nodeQuery,
        @Nullable final Function<IRI, IRI> uriRewriter, final String... expandQueries) {
    this.creator = Objects.requireNonNull(creator);
    this.textQuery = Util.parse(textQuery);
    this.nodeQuery = Util.parse(nodeQuery);
    this.uriRewriter = uriRewriter;

    final TupleExpr[] parsed = new TupleExpr[expandQueries.length];
    int slot = 0;
    for (final String query : expandQueries) {
        parsed[slot++] = Util.parse(query);
    }
    this.expandQueries = parsed;
}
220
221 public QuadModel convert(final QuadModel model) throws Throwable {
222
223 final ValueFactory vf = Statements.VALUE_FACTORY;
224 final QuadModel result = QuadModel.create();
225
226 final Map<IRI, Sentence> sentences = new HashMap<>();
227 for (final BindingSet binding : Util.query(model, this.textQuery)) {
228 final IRI uri = vf.createIRI(((IRI) binding.getValue("uri")).getNamespace());
229 final String text = binding.getValue("text").stringValue().trim();
230 sentences.put(uri, new Sentence(text));
231 }
232
233 final Map<Value, IRI> nodeSentences = Maps.newHashMap();
234 final Multimap<Value, String> nodeTerms = HashMultimap.create();
235 for (final BindingSet binding : Util.query(model, this.nodeQuery)) {
236 final IRI node = (IRI) binding.getValue("node");
237 final IRI head = (IRI) binding.getValue("head");
238 IRI sentenceIRI = (IRI) binding.getValue("sentence");
239 sentenceIRI = sentenceIRI != null ? vf.createIRI(sentenceIRI.getNamespace()) : vf
240 .createIRI(node.getNamespace());
241 final Sentence sentence = sentences.get(sentenceIRI);
242 final String term;
243 if (head != null) {
244 term = sentence.getTerm(head.getLocalName());
245 } else {
246 final int begin = ((Literal) binding.getValue("begin")).intValue();
247 final int end = ((Literal) binding.getValue("end")).intValue();
248 term = sentence.getTerm(begin, end);
249 }
250 nodeTerms.put(node, term);
251 nodeSentences.put(node, sentenceIRI);
252 }
253
254 final Set<Statement> splittingStmts = Sets.newHashSet();
255 for (final Statement stmt : model) {
256 if (EVAL.METADATA.equals(stmt.getContext())) {
257 splittingStmts.add(stmt);
258 }
259 }
260
261 for (final Map.Entry<IRI, Sentence> entry : sentences.entrySet()) {
262 final IRI sentenceIRI = entry.getKey();
263 final IRI graphIRI = vf.createIRI(sentenceIRI + "graph_" + this.creator);
264 result.add(sentenceIRI, RDF.TYPE, EVAL.SENTENCE, EVAL.METADATA);
265 result.add(sentenceIRI, RDFS.LABEL, vf.createLiteral(entry.getValue().getText()),
266 EVAL.METADATA);
267 result.add(graphIRI, RDF.TYPE, EVAL.KNOWLEDGE_GRAPH, EVAL.METADATA);
268 result.add(graphIRI, DCTERMS.SOURCE, sentenceIRI, EVAL.METADATA);
269 result.add(graphIRI, DCTERMS.CREATOR, vf.createLiteral(this.creator), EVAL.METADATA);
270 }
271
272 for (final Value node : nodeTerms.keySet()) {
273 final IRI sentenceIRI = nodeSentences.get(node);
274 final IRI graphIRI = vf.createIRI(sentenceIRI + "graph_" + this.creator);
275 final Collection<String> terms = nodeTerms.get(node);
276 for (final String term : terms) {
277 final IRI termIRI = vf.createIRI(sentenceIRI + "term_" + term);
278 final IRI nodeIRI = terms.size() == 1 ? (IRI) node : vf.createIRI(node + "_"
279 + term);
280 result.add(nodeIRI, RDF.TYPE, EVAL.NODE, graphIRI);
281 result.add(nodeIRI, EVAL.DENOTED_BY, termIRI, graphIRI);
282 }
283 }
284
285 final Set<Statement> expanded = Sets.newHashSet();
286 for (final TupleExpr expandQuery : this.expandQueries) {
287 for (final BindingSet bindings : Util.query(model, expandQuery)) {
288 final Value s = bindings.getValue("s");
289 final Value p = bindings.getValue("p");
290 final Value o = bindings.getValue("o");
291 if (s instanceof Resource && p instanceof IRI && o instanceof Value) {
292 expanded.add(vf.createStatement((Resource) s, (IRI) p, o));
293 }
294 }
295 }
296
297 for (final Statement stmt : Iterables.concat(model, expanded)) {
298 IRI pred = stmt.getPredicate();
299 Value obj = stmt.getObject();
300 if (EVAL.METADATA.equals(stmt.getContext())) {
301 continue;
302 }
303 final Resource subj = stmt.getSubject();
304 if (IGNORABLE_TERMS.contains(pred) || pred.equals(RDF.TYPE)
305 && IGNORABLE_TERMS.contains(obj)) {
306 continue;
307 }
308 if (this.uriRewriter != null) {
309 pred = this.uriRewriter.apply(pred);
310 if (pred.equals(RDF.TYPE) && obj instanceof IRI) {
311 obj = this.uriRewriter.apply((IRI) obj);
312 }
313 }
314 final Collection<String> subjTerms = nodeTerms.get(subj);
315 if (!subjTerms.isEmpty()) {
316 final IRI sentenceIRI = nodeSentences.get(subj);
317 final IRI graphIRI = vf.createIRI(sentenceIRI + "graph_" + this.creator);
318 final List<Value> subjIRIs = split(subj, subjTerms);
319 final List<Value> objValues = split(obj, nodeTerms.get(obj));
320 corefer(result, graphIRI, subjIRIs);
321 corefer(result, graphIRI, objValues);
322 boolean added = false;
323 final boolean splitting = subjIRIs.size() > 1 || objValues.size() > 1;
324 for (final Value subjIRI : subjIRIs) {
325 for (final Value objValue : objValues) {
326 final Statement s = vf.createStatement((IRI) subjIRI, pred, objValue,
327 graphIRI);
328 if (!splitting || splittingStmts.contains(s)) {
329 result.add(s);
330 added = true;
331 }
332 }
333 }
334 if (!added) {
335 throw new IllegalArgumentException("Could not split statement: "
336 + vf.createStatement(subj, pred, obj, stmt.getContext()) + "\nsubj: "
337 + subjIRIs + "\nobj: " + objValues);
338 }
339 }
340 }
341
342 return result;
343 }
344
345 public static void replaceNominalFrames(final QuadModel model) {
346
347 for (final Resource graphID : model.contexts()) {
348
349 final Map<IRI, IRI> terms = Maps.newHashMap();
350 for (final Statement stmt : model.filter(null, EVAL.DENOTED_BY, null, graphID)) {
351 terms.put((IRI) stmt.getSubject(), (IRI) stmt.getObject());
352 }
353
354 final Set<IRI> allPreds = Sets.newHashSet();
355 final Set<IRI> nbPreds = Sets.newHashSet();
356 final Set<IRI> pbPreds = Sets.newHashSet();
357 for (final Statement stmt : model.filter(null, RDF.TYPE, null, graphID)) {
358 if (stmt.getObject() instanceof IRI) {
359 final String ns = ((IRI) stmt.getObject()).getNamespace();
360 if (isFrameNS(ns)) {
361 final IRI pred = (IRI) stmt.getSubject();
362 allPreds.add(pred);
363 if (ns.equals("http://pikes.fbk.eu/ontologies/propbank#")) {
364 pbPreds.add(pred);
365 }
366 if (ns.equals("http://pikes.fbk.eu/ontologies/nombank#")) {
367 nbPreds.add(pred);
368 }
369 }
370 }
371 }
372 final Set<IRI> nomPreds = Sets.newHashSet();
373 nomPreds.addAll(nbPreds);
374 nomPreds.addAll(Sets.difference(allPreds, pbPreds));
375
376 for (final IRI pred : nomPreds) {
377 final IRI predTerm = terms.get(pred);
378 final List<Statement> stmts = Lists.newArrayList(model.filter(pred, null, null,
379 graphID));
380 IRI newSubj = pred;
381 for (final Statement stmt : stmts) {
382 final IRI argTerm = terms.get(stmt.getObject());
383 if (predTerm.equals(argTerm)) {
384 newSubj = (IRI) stmt.getObject();
385 break;
386 }
387 }
388 for (final Statement stmt : stmts) {
389 final boolean isFrameRole = isFrameNS(stmt.getPredicate().getNamespace());
390 final boolean isFrameType = !isFrameRole && stmt.getObject() instanceof IRI
391 && isFrameNS(((IRI) stmt.getObject()).getNamespace());
392 if (isFrameRole && !newSubj.equals(stmt.getObject())) {
393 model.add(newSubj, DUL_ASSOCIATED_WITH, stmt.getObject(), graphID);
394 }
395 if (isFrameRole || isFrameType || newSubj != pred) {
396 model.remove(stmt);
397 }
398 }
399 }
400 }
401 }
402
403 private static boolean isFrameNS(final String ns) {
404 return ns.equals("http://pikes.fbk.eu/ontologies/propbank#")
405 || ns.equals("http://pikes.fbk.eu/ontologies/nombank#")
406 || ns.equals("http://pikes.fbk.eu/ontologies/verbnet#")
407 || ns.equals("http://pikes.fbk.eu/ontologies/framenet#");
408 }
409
410 private static List<Value> split(final Value value, final Collection<String> terms) {
411 if (terms.size() <= 1) {
412 return ImmutableList.of(value);
413 } else {
414 final List<Value> values = Lists.newArrayListWithCapacity(terms.size());
415 for (final String term : terms) {
416 values.add(Statements.VALUE_FACTORY.createIRI(value + "_" + term));
417 }
418 return ImmutableList.copyOf(values);
419 }
420 }
421
422 private static void corefer(final QuadModel model, final Resource graph,
423 @Nullable final Collection<Value> values) {
424 if (values != null && values.size() > 1) {
425 for (final Value value1 : values) {
426 for (final Value value2 : values) {
427 if (Util.VALUE_ORDERING.compare(value1, value2) < 0) {
428 model.add((Resource) value1, OWL.SAMEAS, (Resource) value2, graph);
429 }
430 }
431 }
432 }
433 }
434
435 public static void main(final String... args) {
436
437 try {
438
439 final CommandLine cmd = CommandLine
440 .parser()
441 .withName("eval-converter")
442 .withHeader("Convert a tool output in the format used for the evaluation.")
443 .withOption("o", "output", "the output file", "FILE", Type.STRING, true,
444 false, true)
445 .withOption("f", "format", "the format (fred, pikes, gold)", "FMT",
446 Type.STRING, true, false, true)
447 .withOption("n", "replace-nominal",
448 "replaces nominal frames with association "
449 + " relations (for FRED compatibility)")
450 .withLogger(LoggerFactory.getLogger("eu.fbk"))
451 .parse(args);
452
453
454 final String format = cmd.getOptionValue("f", String.class).trim().toLowerCase();
455 final String outputFile = cmd.getOptionValue("o", String.class);
456 final List<String> inputFiles = cmd.getArgs(String.class);
457 final boolean replaceNominalFrames = cmd.hasOption("n");
458
459
460 Converter converter;
461 if (format.equalsIgnoreCase("fred")) {
462 converter = FRED_CONVERTER;
463 } else if (format.equalsIgnoreCase("gold")) {
464 converter = GOLD_CONVERTER;
465 } else if (format.equalsIgnoreCase("pikes")) {
466 converter = PIKES_CONVERTER;
467 } else {
468 throw new IllegalArgumentException("Unknown format: " + format);
469 }
470
471
472 final Map<String, String> namespaces = Maps.newHashMap();
473 final QuadModel input = QuadModel.create();
474 RDFSources.read(false, false, null, null, null, true,
475 inputFiles.toArray(new String[inputFiles.size()])).emit(
476 RDFHandlers.wrap(input, namespaces), 1);
477
478
479 final QuadModel output = converter.convert(input);
480
481
482 if (replaceNominalFrames) {
483 replaceNominalFrames(output);
484 }
485
486
487 final RDFHandler out = RDFHandlers.write(null, 1000, outputFile);
488 out.startRDF();
489 namespaces.put(DCTERMS.PREFIX, DCTERMS.NAMESPACE);
490 namespaces.put("pb", "http://pikes.fbk.eu/ontologies/propbank#");
491 namespaces.put("nb", "http://pikes.fbk.eu/ontologies/nombank#");
492 namespaces.put("vn", "http://pikes.fbk.eu/ontologies/verbnet#");
493 namespaces.put("fn", "http://pikes.fbk.eu/ontologies/framenet#");
494 namespaces.put("dul", "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#");
495 final Set<String> outputNS = Sets.newHashSet();
496 collectNS(outputNS, output);
497 for (final Map.Entry<String, String> entry : namespaces.entrySet()) {
498 if (!entry.getKey().isEmpty() && outputNS.contains(entry.getValue())) {
499 out.handleNamespace(entry.getKey(), entry.getValue());
500 }
501 }
502 for (final Statement stmt : Ordering.from(
503 Statements.statementComparator("cspo",
504 Statements.valueComparator(RDF.NAMESPACE))).sortedCopy(output)) {
505 out.handleStatement(stmt);
506 }
507 out.endRDF();
508
509 } catch (final Throwable ex) {
510
511 CommandLine.fail(ex);
512 }
513 }
514
515 private static void collectNS(final Collection<String> ns, final Iterable<Statement> stmts) {
516 for (final Statement stmt : stmts) {
517 collectNS(ns, stmt.getSubject());
518 collectNS(ns, stmt.getPredicate());
519 collectNS(ns, stmt.getObject());
520 collectNS(ns, stmt.getContext());
521 }
522 }
523
524 private static void collectNS(final Collection<String> ns, @Nullable final Value value) {
525 if (value instanceof IRI) {
526 ns.add(((IRI) value).getNamespace());
527 }
528 }
529
530 private static class Sentence {
531
532 private final String text;
533
534 private final int[] beginIndexes;
535
536 private final int[] endIndexes;
537
538 private final List<String> termList;
539
540 private final Set<String> termSet;
541
542 public Sentence(final String text) {
543
544 final int[] begins = new int[text.length()];
545 final int[] ends = new int[text.length()];
546 final List<String> termList = Lists.newArrayList();
547 final Set<String> termSet = Sets.newHashSet();
548 int count = 0;
549
550 final Set<String> ambiguousTerms = Sets.newHashSet();
551 boolean insideTerm = false;
552 for (int i = 0; i < text.length(); ++i) {
553 final char ch = text.charAt(i);
554 final boolean letter = Character.isLetter(ch) || ch == '-' || ch == '_';
555 if (letter && !insideTerm) {
556 begins[count] = i;
557 insideTerm = true;
558 } else if (!letter && insideTerm) {
559 ends[count] = i;
560 final String term = text.substring(begins[count], ends[count]);
561 termList.add(term);
562 if (!termSet.add(term)) {
563 ambiguousTerms.add(term);
564 }
565 ++count;
566 insideTerm = false;
567 }
568 }
569
570 for (final String term : ambiguousTerms) {
571 int index = 0;
572 termSet.remove(term);
573 for (int i = 0; i < termList.size(); ++i) {
574 if (termList.get(i).equals(term)) {
575 final String t = term + "_" + (++index);
576 termList.set(i, t);
577 termSet.add(t);
578 }
579 }
580 }
581
582 this.text = text;
583 this.beginIndexes = Arrays.copyOfRange(begins, 0, count);
584 this.endIndexes = Arrays.copyOfRange(ends, 0, count);
585 this.termList = termList;
586 this.termSet = termSet;
587 }
588
589 public String getText() {
590 return this.text;
591 }
592
593 public String getTerm(final String localName) {
594 int index = localName.length();
595 while (true) {
596 final String candidate = localName.substring(0, index);
597 for (final String term : this.termList) {
598 if (candidate.equalsIgnoreCase(term)) {
599 return term;
600 }
601 }
602 index = localName.lastIndexOf('_', index);
603 if (index < 0) {
604 throw new IllegalArgumentException("Cannot map " + localName
605 + " to a term\nterms: " + this.termSet);
606 }
607 }
608 }
609
610 public String getTerm(final int beginIndex, final int endIndex) {
611 final List<String> matches = Lists.newArrayList();
612 for (int i = 0; i < this.beginIndexes.length; ++i) {
613 if (beginIndex < this.endIndexes[i] && endIndex > this.beginIndexes[i]) {
614 matches.add(this.termList.get(i));
615 }
616 }
617 if (matches.size() == 0) {
618 throw new IllegalArgumentException("No term matching indexes " + beginIndex + ", "
619 + endIndex);
620 }
621 if (matches.size() > 1) {
622 throw new IllegalArgumentException("Multiple terms matching indexes " + beginIndex
623 + ", " + endIndex + "\ntext: " + this.text + "\nbegins: "
624 + Arrays.toString(this.beginIndexes) + "\nends: "
625 + Arrays.toString(this.endIndexes));
626 }
627 return matches.get(0);
628 }
629
630 }
631
632 }