1 package eu.fbk.dkm.pikes.resources;
2
3 import com.google.common.base.Objects;
4 import com.google.common.base.Preconditions;
5 import com.google.common.base.Throwables;
6 import com.google.common.collect.*;
7 import eu.fbk.rdfpro.util.IO;
8 import ixa.kaflib.Dep;
9 import ixa.kaflib.KAFDocument;
10 import ixa.kaflib.Term;
11
12 import javax.annotation.Nullable;
13 import java.io.BufferedReader;
14 import java.io.IOException;
15 import java.io.Reader;
16 import java.io.Writer;
17 import java.lang.reflect.Constructor;
18 import java.lang.reflect.InvocationTargetException;
19 import java.net.URL;
20 import java.util.*;
21
22 public class Lexicon<T extends Lexicon.Lexeme> extends AbstractSet<T> {
23
24 private final Map<String, T> idIndex;
25
26 private final Multimap<String, T> lemmaIndex;
27
28 private final Multimap<String, T> stemIndex;
29
30 public Lexicon(final Iterable<? extends T> lexemes) {
31
32 final ImmutableMap.Builder<String, T> idBuilder = ImmutableMap.builder();
33 final ImmutableMultimap.Builder<String, T> lemmaBuilder = ImmutableMultimap.builder();
34 final ImmutableMultimap.Builder<String, T> stemBuilder = ImmutableMultimap.builder();
35
36 for (final T lexeme : Ordering.natural().immutableSortedCopy(lexemes)) {
37 idBuilder.put(lexeme.getId(), lexeme);
38 for (final Token token : lexeme.getTokens()) {
39 if (token.getLemma() != null) {
40 lemmaBuilder.put(token.getLemma(), lexeme);
41 }
42 if (token.getStem() != null) {
43 stemBuilder.put(token.getStem(), lexeme);
44 }
45 }
46 }
47
48 this.idIndex = idBuilder.build();
49 this.lemmaIndex = lemmaBuilder.build();
50 this.stemIndex = stemBuilder.build();
51 }
52
53 public static <T extends Lexeme, L extends Lexicon<T>> L create(final Class<L> lexiconClass,
54 final Iterable<T> lexemes) {
55
56 for (final Constructor<?> constructor : lexiconClass.getDeclaredConstructors()) {
57 final Class<?>[] argTypes = constructor.getParameterTypes();
58 if (argTypes.length == 1 && argTypes[0].isAssignableFrom(List.class)) {
59 try {
60 constructor.setAccessible(true);
61 return lexiconClass
62 .cast(constructor.newInstance(ImmutableList.copyOf(lexemes)));
63 } catch (final InvocationTargetException ex) {
64 throw Throwables.propagate(ex.getCause());
65 } catch (final IllegalAccessException | InstantiationException ex) {
66 throw new IllegalArgumentException("Class " + lexiconClass.getName()
67 + " could not be instantiated", ex);
68 }
69 }
70 }
71 throw new IllegalArgumentException("No suitable constructor for class "
72 + lexiconClass.getName());
73 }
74
75 public static <T extends Lexeme, L extends Lexicon<T>> L readFrom(final Class<L> lexiconClass,
76 final Class<T> lexemeClass, final String location) throws IOException {
77
78
79 Reader reader = null;
80 try {
81 reader = IO.utf8Reader(IO.buffer(IO.read(location)));
82 } catch (final Throwable ex) {
83 final URL url = lexiconClass.getResource(location);
84 if (url == null) {
85 Throwables.propagateIfPossible(ex, IOException.class);
86 Throwables.propagate(ex);
87 }
88 reader = IO.utf8Reader(url.openStream());
89 }
90
91
92 try {
93 return readFrom(lexiconClass, lexemeClass, reader);
94 } finally {
95 reader.close();
96 }
97 }
98
99 public static <T extends Lexeme, L extends Lexicon<T>> L readFrom(final Class<L> lexiconClass,
100 final Class<T> lexemeClass, final Reader reader) throws IOException {
101 final List<T> lexemes = Lists.newArrayList();
102 final BufferedReader in = reader instanceof BufferedReader ? (BufferedReader) reader
103 : new BufferedReader(reader);
104 String line;
105 while ((line = in.readLine()) != null) {
106 T token = Lexeme.parse(lexemeClass, line);
107 if (token == null) {
108 continue;
109 }
110 lexemes.add(token);
111 }
112 return create(lexiconClass, lexemes);
113 }
114
115 @Override
116 public int size() {
117 return this.idIndex.size();
118 }
119
120 @Override
121 public boolean contains(final Object object) {
122 if (object instanceof Lexeme) {
123 final Lexeme lexeme = (Lexeme) object;
124 return this.idIndex.get(lexeme.getId()) == lexeme;
125 }
126 return false;
127 }
128
129 @Override
130 public Iterator<T> iterator() {
131 return this.idIndex.values().iterator();
132 }
133
134 public T get(final String id) {
135 return this.idIndex.get(id);
136 }
137
138 public Set<T> match(final KAFDocument document, final Term term) {
139 return ImmutableSet.copyOf(match(document, ImmutableSet.of(term)).get(term));
140 }
141
142 public Multimap<Term, T> match(final KAFDocument document, final Iterable<Term> terms) {
143 Preconditions.checkNotNull(document);
144 final Set<Term> termSet = ImmutableSet.copyOf(terms);
145 final Multimap<Term, T> result = HashMultimap.create();
146 for (final Term term : termSet) {
147 final String lemma = term.getLemma();
148 final String stem = Stemming.stem(null, lemma);
149 for (final T lexeme : ImmutableSet.copyOf(Iterables.concat(
150 this.lemmaIndex.get(term.getLemma()), this.stemIndex.get(stem)))) {
151 if (lexeme.match(document, termSet, term)) {
152 result.put(term, lexeme);
153 }
154 }
155 }
156 return result;
157 }
158
159 public void writeTo(final String location) throws IOException {
160 try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(location)))) {
161 writeTo(writer);
162 }
163 }
164
165 public <A extends Appendable> A writeTo(final A out) throws IOException {
166 for (final Lexeme lexeme : this.idIndex.values()) {
167 lexeme.toString(out);
168 out.append('\n');
169 }
170 return out;
171 }
172
173 public static class Lexeme implements Comparable<Lexeme> {
174
175 private static final Map<Class<?>, Constructor<?>> CACHED_CONSTRUCTORS = Maps
176 .newConcurrentMap();
177
178 private final String id;
179
180 private final List<Token> tokens;
181
182 public Lexeme(final String id, final Iterable<Token> tokens) {
183 this.id = Preconditions.checkNotNull(id);
184 this.tokens = ImmutableList.copyOf(tokens);
185 }
186
187 public static <T extends Lexeme> T create(final Class<T> lexemeClass, final String id,
188 final Iterable<Token> tokens, @Nullable final Map<String, String> properties) {
189
190
191 final List<Token> actualTokens = ImmutableList.copyOf(tokens);
192 final Map<String, String> actualProperties = properties == null ? ImmutableMap.of()
193 : ImmutableMap.copyOf(properties);
194
195
196 Constructor<?> constructor = CACHED_CONSTRUCTORS.get(lexemeClass);
197 if (constructor == null) {
198 for (final Constructor<?> candidate : lexemeClass.getDeclaredConstructors()) {
199 final Class<?>[] argTypes = candidate.getParameterTypes();
200 if (argTypes.length >= 2 && argTypes[0].isAssignableFrom(String.class)
201 && argTypes[1].isAssignableFrom(List.class)) {
202 if (argTypes.length == 3 && argTypes[2].isAssignableFrom(Map.class)) {
203 constructor = candidate;
204 break;
205 }
206 else if (argTypes.length == 2 && constructor == null) {
207 constructor = candidate;
208 }
209 }
210 }
211 if (constructor == null) {
212 throw new IllegalArgumentException("No suitable constructor for class "
213 + lexemeClass.getName());
214 }
215 constructor.setAccessible(true);
216 CACHED_CONSTRUCTORS.put(lexemeClass, constructor);
217 }
218
219 try {
220
221 if (constructor.getParameterCount() == 2) {
222 return lexemeClass.cast(constructor.newInstance(id, actualTokens));
223 }
224 else {
225 return lexemeClass.cast(constructor.newInstance(id, actualTokens,
226 actualProperties));
227 }
228 } catch (final InvocationTargetException ex) {
229 throw Throwables.propagate(ex.getCause());
230 } catch (final IllegalAccessException | InstantiationException ex) {
231 throw new IllegalArgumentException("Class " + lexemeClass.getName()
232 + " could not be instantiated", ex);
233 }
234 }
235
236 @Nullable
237 public static <T extends Lexeme> T parse(final Class<T> lexemeClass, final String string) {
238 final String[] fields = string.split("\t");
239 if (fields.length < 2) {
240 return null;
241 }
242 final String id = fields[0].trim();
243 final List<Token> tokens = Lists.newArrayList();
244 for (final String tokenStr : fields[1].substring("tokens=".length()).split(",")) {
245 tokens.add(Token.parse(tokenStr.trim(), id));
246 }
247 final ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
248 for (int i = 2; i < fields.length; ++i) {
249 final String fieldStr = fields[i];
250 final int index = fieldStr.indexOf('=');
251 if (index < 0) {
252 final String key = fieldStr.trim().intern();
253 final String value = "true";
254 builder.put(key, value);
255 }
256 else {
257 final String key = fieldStr.substring(0, index).trim().intern();
258 final String value = fieldStr.substring(index + 1).trim().intern();
259 builder.put(key, value);
260 }
261 }
262 return create(lexemeClass, id, tokens, builder.build());
263 }
264
265 protected Map<String, String> getProperties() {
266 return ImmutableMap.of();
267 }
268
269 public final String getId() {
270 return this.id;
271 }
272
273 public final List<Token> getTokens() {
274 return this.tokens;
275 }
276
277 public final boolean match(final KAFDocument document, final Iterable<Term> terms,
278 final Term head) {
279 final Term[][] solutions = matchRecursive(document, terms, head);
280 if (solutions != null) {
281 outer:
282 for (final Term[] solution : solutions) {
283 for (int i = 0; i < this.tokens.size(); ++i) {
284 if (solution[i] == null) {
285 continue outer;
286 }
287 }
288 return true;
289 }
290 }
291 return false;
292 }
293
294 @Nullable
295 private Term[][] matchRecursive(final KAFDocument document, final Iterable<Term> terms,
296 final Term head) {
297
298
299 int combinations = 0;
300
301
302 final List<Integer> indexes = Lists.newArrayList();
303 for (int i = 0; i < this.tokens.size(); ++i) {
304 if (this.tokens.get(i).match(head)) {
305 indexes.add(i);
306 ++combinations;
307 }
308 }
309 if (combinations == 0) {
310 return null;
311 }
312
313
314 final List<Dep> deps = document.getDepsFromTerm(head);
315 final Term[][][] childSolutions = new Term[deps.size()][][];
316 for (int i = 0; i < deps.size(); ++i) {
317 final Term child = deps.get(i).getTo();
318 if (Iterables.contains(terms, child)) {
319 childSolutions[i] = matchRecursive(document, terms, child);
320 combinations *= childSolutions[i] == null ? 1 : childSolutions[i].length + 1;
321 }
322 }
323
324
325 final Term[] solution = new Term[this.tokens.size()];
326 final List<Term[]> solutions = Lists.newArrayList();
327 outer:
328 for (int i = 0; i < combinations; ++i) {
329 Arrays.fill(solution, null);
330 solution[indexes.get(i % indexes.size())] = head;
331 int index = i / indexes.size();
332 for (int j = 0; j < deps.size(); ++j) {
333 if (childSolutions[j] != null) {
334 final int len = childSolutions[j].length;
335 final int num = index % (len + 1);
336 index = index / (len + 1);
337 if (num != len) {
338 final Term[] childSolution = childSolutions[j][num];
339 for (int k = 0; k < this.tokens.size(); ++k) {
340 if (childSolution[k] != null) {
341 if (solution[k] != null) {
342 continue outer;
343 }
344 solution[k] = childSolution[k];
345 }
346 }
347 }
348 }
349 }
350 int offset = Integer.MIN_VALUE;
351 for (int k = 0; k < this.tokens.size(); ++k) {
352 final Term term = solution[k];
353 if (term != null) {
354 if (term.getOffset() < offset) {
355 continue outer;
356 }
357 offset = term.getOffset();
358 }
359 }
360 solutions.add(solution.clone());
361 }
362 return solutions.toArray(new Term[solutions.size()][]);
363 }
364
365 @Override
366 public final int compareTo(final Lexeme lexeme) {
367 return this.id.compareTo(lexeme.id);
368 }
369
370 @Override
371 public final boolean equals(final Object object) {
372 if (object == this) {
373 return true;
374 }
375 if (!(object instanceof Lexeme)) {
376 return false;
377 }
378 final Lexeme other = (Lexeme) object;
379 return this.id.equals(other.id);
380 }
381
382 @Override
383 public final int hashCode() {
384 return this.id.hashCode();
385 }
386
387 public final <T extends Appendable> T toString(final T out) throws IOException {
388 out.append(this.id);
389 out.append("\ttokens=");
390 for (int i = 0; i < this.tokens.size(); ++i) {
391 out.append(i == 0 ? "" : ",");
392 this.tokens.get(i).toString(out);
393 }
394 final Map<String, String> properties = getProperties();
395 for (final String name : Ordering.natural().immutableSortedCopy(properties.keySet())) {
396 final String value = properties.get(name);
397 out.append('\t').append(name).append('=').append(value);
398 }
399 return out;
400 }
401
402 @Override
403 public final String toString() {
404 try {
405 return toString(new StringBuilder()).toString();
406 } catch (final IOException ex) {
407 throw new Error(ex);
408 }
409 }
410
411 }
412
413 public static final class Token {
414
415 public static final Token WILDCARD = new Token(null, null, null);
416
417 @Nullable
418 private final String lemma;
419
420 @Nullable
421 private final String stem;
422
423 @Nullable
424 private final String pos;
425
426 private Token(@Nullable final String lemma, @Nullable final String stem,
427 @Nullable final String pos) {
428 this.lemma = lemma == null ? null : lemma.toLowerCase().intern();
429 this.stem = stem == null ? null : stem.toLowerCase().intern();
430 this.pos = pos == null ? null : pos.toUpperCase().intern();
431 }
432
433 public static Token create(@Nullable final String lemma, @Nullable final String stem,
434 @Nullable final String pos) {
435 return lemma == null && stem == null && pos == null ? WILDCARD
436 : new Token(lemma, stem, pos);
437 }
438
439 public static Token parse(final String string) {
440 return parse(string, null);
441 }
442
443 public static Token parse(final String string, @Nullable String altLemma) {
444 final String[] fields = string.split("\\|");
445 String lemma = fields[0].trim();
446 final String stem = fields[1].trim();
447 final String pos = fields[2].trim();
448 if (".".equals(lemma)) {
449 lemma = altLemma;
450 }
451 return create("*".equals(lemma) ? null : lemma, "*".equals(stem) ? null : stem,
452 "*".equals(pos) ? null : pos);
453 }
454
455 @Nullable
456 public String getLemma() {
457 return this.lemma;
458 }
459
460 @Nullable
461 public String getStem() {
462 return this.stem;
463 }
464
465 @Nullable
466 public String getPos() {
467 return this.pos;
468 }
469
470 public boolean match(@Nullable final Term term) {
471 return term != null
472 && (this.pos == null || this.pos.equalsIgnoreCase(term.getPos()) || this.pos
473 .equals(term.getMorphofeat()))
474 && (this.lemma == null || this.lemma.equalsIgnoreCase(term.getLemma()))
475 && (this.stem == null || this.stem.equalsIgnoreCase(Stemming.stem(null,
476 term.getStr())));
477 }
478
479 @Override
480 public boolean equals(final Object object) {
481 if (object == this) {
482 return true;
483 }
484 if (!(object instanceof Token)) {
485 return false;
486 }
487 final Token other = (Token) object;
488 return Objects.equal(this.lemma, other.lemma) && Objects.equal(this.stem, other.stem)
489 && Objects.equal(this.pos, other.pos);
490 }
491
492 @Override
493 public int hashCode() {
494 return Objects.hashCode(this.lemma, this.stem, this.pos);
495 }
496
497 public <T extends Appendable> T toString(final T out) throws IOException {
498 out.append(Objects.firstNonNull(this.lemma, "*"));
499 out.append("|");
500 out.append(Objects.firstNonNull(this.stem, "*"));
501 out.append("|");
502 out.append(Objects.firstNonNull(this.pos, "*"));
503 return out;
504 }
505
506 @Override
507 public String toString() {
508 try {
509 return toString(new StringBuilder()).toString();
510 } catch (final IOException ex) {
511 throw new Error(ex);
512 }
513 }
514
515 }
516
517 }