1   package eu.fbk.dkm.pikes.resources;
2   
3   import com.google.common.base.Objects;
4   import com.google.common.base.Preconditions;
5   import com.google.common.base.Throwables;
6   import com.google.common.collect.*;
7   import eu.fbk.rdfpro.util.IO;
8   import ixa.kaflib.Dep;
9   import ixa.kaflib.KAFDocument;
10  import ixa.kaflib.Term;
11  
12  import javax.annotation.Nullable;
13  import java.io.BufferedReader;
14  import java.io.IOException;
15  import java.io.Reader;
16  import java.io.Writer;
17  import java.lang.reflect.Constructor;
18  import java.lang.reflect.InvocationTargetException;
19  import java.net.URL;
20  import java.util.*;
21  
22  public class Lexicon<T extends Lexicon.Lexeme> extends AbstractSet<T> {
23  
24  	private final Map<String, T> idIndex;
25  
26  	private final Multimap<String, T> lemmaIndex;
27  
28  	private final Multimap<String, T> stemIndex;
29  
30  	public Lexicon(final Iterable<? extends T> lexemes) {
31  
32  		final ImmutableMap.Builder<String, T> idBuilder = ImmutableMap.builder();
33  		final ImmutableMultimap.Builder<String, T> lemmaBuilder = ImmutableMultimap.builder();
34  		final ImmutableMultimap.Builder<String, T> stemBuilder = ImmutableMultimap.builder();
35  
36  		for (final T lexeme : Ordering.natural().immutableSortedCopy(lexemes)) {
37  			idBuilder.put(lexeme.getId(), lexeme);
38  			for (final Token token : lexeme.getTokens()) {
39  				if (token.getLemma() != null) {
40  					lemmaBuilder.put(token.getLemma(), lexeme);
41  				}
42  				if (token.getStem() != null) {
43  					stemBuilder.put(token.getStem(), lexeme);
44  				}
45  			}
46  		}
47  
48  		this.idIndex = idBuilder.build();
49  		this.lemmaIndex = lemmaBuilder.build();
50  		this.stemIndex = stemBuilder.build();
51  	}
52  
53  	public static <T extends Lexeme, L extends Lexicon<T>> L create(final Class<L> lexiconClass,
54  																	final Iterable<T> lexemes) {
55  
56  		for (final Constructor<?> constructor : lexiconClass.getDeclaredConstructors()) {
57  			final Class<?>[] argTypes = constructor.getParameterTypes();
58  			if (argTypes.length == 1 && argTypes[0].isAssignableFrom(List.class)) {
59  				try {
60  					constructor.setAccessible(true);
61  					return lexiconClass
62  							.cast(constructor.newInstance(ImmutableList.copyOf(lexemes)));
63  				} catch (final InvocationTargetException ex) {
64  					throw Throwables.propagate(ex.getCause());
65  				} catch (final IllegalAccessException | InstantiationException ex) {
66  					throw new IllegalArgumentException("Class " + lexiconClass.getName()
67  							+ " could not be instantiated", ex);
68  				}
69  			}
70  		}
71  		throw new IllegalArgumentException("No suitable constructor for class "
72  				+ lexiconClass.getName());
73  	}
74  
75  	public static <T extends Lexeme, L extends Lexicon<T>> L readFrom(final Class<L> lexiconClass,
76  																	  final Class<T> lexemeClass, final String location) throws IOException {
77  
78  		// Try to open a file at the location specified, falling back to search on the classpath
79  		Reader reader = null;
80  		try {
81  			reader = IO.utf8Reader(IO.buffer(IO.read(location)));
82  		} catch (final Throwable ex) {
83  			final URL url = lexiconClass.getResource(location);
84  			if (url == null) {
85  				Throwables.propagateIfPossible(ex, IOException.class);
86  				Throwables.propagate(ex);
87  			}
88  			reader = IO.utf8Reader(url.openStream());
89  		}
90  
91  		// Read the lexicon from the opened reader
92  		try {
93  			return readFrom(lexiconClass, lexemeClass, reader);
94  		} finally {
95  			reader.close();
96  		}
97  	}
98  
99  	public static <T extends Lexeme, L extends Lexicon<T>> L readFrom(final Class<L> lexiconClass,
100 																	  final Class<T> lexemeClass, final Reader reader) throws IOException {
101 		final List<T> lexemes = Lists.newArrayList();
102 		final BufferedReader in = reader instanceof BufferedReader ? (BufferedReader) reader
103 				: new BufferedReader(reader);
104 		String line;
105 		while ((line = in.readLine()) != null) {
106 			T token = Lexeme.parse(lexemeClass, line);
107 			if (token == null) {
108 				continue;
109 			}
110 			lexemes.add(token);
111 		}
112 		return create(lexiconClass, lexemes);
113 	}
114 
115 	@Override
116 	public int size() {
117 		return this.idIndex.size();
118 	}
119 
120 	@Override
121 	public boolean contains(final Object object) {
122 		if (object instanceof Lexeme) {
123 			final Lexeme lexeme = (Lexeme) object;
124 			return this.idIndex.get(lexeme.getId()) == lexeme;
125 		}
126 		return false;
127 	}
128 
129 	@Override
130 	public Iterator<T> iterator() {
131 		return this.idIndex.values().iterator();
132 	}
133 
134 	public T get(final String id) {
135 		return this.idIndex.get(id);
136 	}
137 
138 	public Set<T> match(final KAFDocument document, final Term term) {
139 		return ImmutableSet.copyOf(match(document, ImmutableSet.of(term)).get(term));
140 	}
141 
142 	public Multimap<Term, T> match(final KAFDocument document, final Iterable<Term> terms) {
143 		Preconditions.checkNotNull(document);
144 		final Set<Term> termSet = ImmutableSet.copyOf(terms);
145 		final Multimap<Term, T> result = HashMultimap.create();
146 		for (final Term term : termSet) {
147 			final String lemma = term.getLemma();
148 			final String stem = Stemming.stem(null, lemma);
149 			for (final T lexeme : ImmutableSet.copyOf(Iterables.concat(
150 					this.lemmaIndex.get(term.getLemma()), this.stemIndex.get(stem)))) {
151 				if (lexeme.match(document, termSet, term)) {
152 					result.put(term, lexeme);
153 				}
154 			}
155 		}
156 		return result;
157 	}
158 
159 	public void writeTo(final String location) throws IOException {
160 		try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(location)))) {
161 			writeTo(writer);
162 		}
163 	}
164 
165 	public <A extends Appendable> A writeTo(final A out) throws IOException {
166 		for (final Lexeme lexeme : this.idIndex.values()) {
167 			lexeme.toString(out);
168 			out.append('\n');
169 		}
170 		return out;
171 	}
172 
173 	public static class Lexeme implements Comparable<Lexeme> {
174 
175 		private static final Map<Class<?>, Constructor<?>> CACHED_CONSTRUCTORS = Maps
176 				.newConcurrentMap();
177 
178 		private final String id;
179 
180 		private final List<Token> tokens;
181 
182 		public Lexeme(final String id, final Iterable<Token> tokens) {
183 			this.id = Preconditions.checkNotNull(id);
184 			this.tokens = ImmutableList.copyOf(tokens);
185 		}
186 
187 		public static <T extends Lexeme> T create(final Class<T> lexemeClass, final String id,
188 												  final Iterable<Token> tokens, @Nullable final Map<String, String> properties) {
189 
190 			// Normalize tokens and properties
191 			final List<Token> actualTokens = ImmutableList.copyOf(tokens);
192 			final Map<String, String> actualProperties = properties == null ? ImmutableMap.of()
193 					: ImmutableMap.copyOf(properties);
194 
195 			// Locate a suitable constructor, using a cache to speed up the process
196 			Constructor<?> constructor = CACHED_CONSTRUCTORS.get(lexemeClass);
197 			if (constructor == null) {
198 				for (final Constructor<?> candidate : lexemeClass.getDeclaredConstructors()) {
199 					final Class<?>[] argTypes = candidate.getParameterTypes();
200 					if (argTypes.length >= 2 && argTypes[0].isAssignableFrom(String.class)
201 							&& argTypes[1].isAssignableFrom(List.class)) {
202 						if (argTypes.length == 3 && argTypes[2].isAssignableFrom(Map.class)) {
203 							constructor = candidate;
204 							break;
205 						}
206 						else if (argTypes.length == 2 && constructor == null) {
207 							constructor = candidate;
208 						}
209 					}
210 				}
211 				if (constructor == null) {
212 					throw new IllegalArgumentException("No suitable constructor for class "
213 							+ lexemeClass.getName());
214 				}
215 				constructor.setAccessible(true);
216 				CACHED_CONSTRUCTORS.put(lexemeClass, constructor);
217 			}
218 
219 			try {
220 				// Invoke the constructor and return the created object
221 				if (constructor.getParameterCount() == 2) {
222 					return lexemeClass.cast(constructor.newInstance(id, actualTokens));
223 				}
224 				else {
225 					return lexemeClass.cast(constructor.newInstance(id, actualTokens,
226 							actualProperties));
227 				}
228 			} catch (final InvocationTargetException ex) {
229 				throw Throwables.propagate(ex.getCause());
230 			} catch (final IllegalAccessException | InstantiationException ex) {
231 				throw new IllegalArgumentException("Class " + lexemeClass.getName()
232 						+ " could not be instantiated", ex);
233 			}
234 		}
235 
236 		@Nullable
237 		public static <T extends Lexeme> T parse(final Class<T> lexemeClass, final String string) {
238 			final String[] fields = string.split("\t");
239 			if (fields.length < 2) {
240 				return null;
241 			}
242 			final String id = fields[0].trim();
243 			final List<Token> tokens = Lists.newArrayList();
244 			for (final String tokenStr : fields[1].substring("tokens=".length()).split(",")) {
245 				tokens.add(Token.parse(tokenStr.trim(), id));
246 			}
247 			final ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
248 			for (int i = 2; i < fields.length; ++i) {
249 				final String fieldStr = fields[i];
250 				final int index = fieldStr.indexOf('=');
251 				if (index < 0) {
252 					final String key = fieldStr.trim().intern();
253 					final String value = "true";
254 					builder.put(key, value);
255 				}
256 				else {
257 					final String key = fieldStr.substring(0, index).trim().intern();
258 					final String value = fieldStr.substring(index + 1).trim().intern();
259 					builder.put(key, value);
260 				}
261 			}
262 			return create(lexemeClass, id, tokens, builder.build());
263 		}
264 
265 		protected Map<String, String> getProperties() {
266 			return ImmutableMap.of(); // could be overridden by subclasses to add other properties
267 		}
268 
269 		public final String getId() {
270 			return this.id;
271 		}
272 
273 		public final List<Token> getTokens() {
274 			return this.tokens;
275 		}
276 
277 		public final boolean match(final KAFDocument document, final Iterable<Term> terms,
278 								   final Term head) {
279 			final Term[][] solutions = matchRecursive(document, terms, head);
280 			if (solutions != null) {
281 				outer:
282 				for (final Term[] solution : solutions) {
283 					for (int i = 0; i < this.tokens.size(); ++i) {
284 						if (solution[i] == null) {
285 							continue outer;
286 						}
287 					}
288 					return true;
289 				}
290 			}
291 			return false;
292 		}
293 
294 		@Nullable
295 		private Term[][] matchRecursive(final KAFDocument document, final Iterable<Term> terms,
296 										final Term head) {
297 
298 			// Use a counter to keep track of possible combinations
299 			int combinations = 0;
300 
301 			// Try to match the head against the different tokens. Abort if not possible
302 			final List<Integer> indexes = Lists.newArrayList();
303 			for (int i = 0; i < this.tokens.size(); ++i) {
304 				if (this.tokens.get(i).match(head)) {
305 					indexes.add(i);
306 					++combinations;
307 				}
308 			}
309 			if (combinations == 0) {
310 				return null;
311 			}
312 
313 			// Match children
314 			final List<Dep> deps = document.getDepsFromTerm(head);
315 			final Term[][][] childSolutions = new Term[deps.size()][][];
316 			for (int i = 0; i < deps.size(); ++i) {
317 				final Term child = deps.get(i).getTo();
318 				if (Iterables.contains(terms, child)) {
319 					childSolutions[i] = matchRecursive(document, terms, child);
320 					combinations *= childSolutions[i] == null ? 1 : childSolutions[i].length + 1;
321 				}
322 			}
323 
324 			// Combine solutions
325 			final Term[] solution = new Term[this.tokens.size()];
326 			final List<Term[]> solutions = Lists.newArrayList();
327 			outer:
328 			for (int i = 0; i < combinations; ++i) {
329 				Arrays.fill(solution, null);
330 				solution[indexes.get(i % indexes.size())] = head;
331 				int index = i / indexes.size();
332 				for (int j = 0; j < deps.size(); ++j) {
333 					if (childSolutions[j] != null) {
334 						final int len = childSolutions[j].length;
335 						final int num = index % (len + 1);
336 						index = index / (len + 1);
337 						if (num != len) {
338 							final Term[] childSolution = childSolutions[j][num];
339 							for (int k = 0; k < this.tokens.size(); ++k) {
340 								if (childSolution[k] != null) {
341 									if (solution[k] != null) {
342 										continue outer;
343 									}
344 									solution[k] = childSolution[k];
345 								}
346 							}
347 						}
348 					}
349 				}
350 				int offset = Integer.MIN_VALUE;
351 				for (int k = 0; k < this.tokens.size(); ++k) {
352 					final Term term = solution[k];
353 					if (term != null) {
354 						if (term.getOffset() < offset) {
355 							continue outer;
356 						}
357 						offset = term.getOffset();
358 					}
359 				}
360 				solutions.add(solution.clone());
361 			}
362 			return solutions.toArray(new Term[solutions.size()][]);
363 		}
364 
365 		@Override
366 		public final int compareTo(final Lexeme lexeme) {
367 			return this.id.compareTo(lexeme.id);
368 		}
369 
370 		@Override
371 		public final boolean equals(final Object object) {
372 			if (object == this) {
373 				return true;
374 			}
375 			if (!(object instanceof Lexeme)) {
376 				return false;
377 			}
378 			final Lexeme other = (Lexeme) object;
379 			return this.id.equals(other.id);
380 		}
381 
382 		@Override
383 		public final int hashCode() {
384 			return this.id.hashCode();
385 		}
386 
387 		public final <T extends Appendable> T toString(final T out) throws IOException {
388 			out.append(this.id);
389 			out.append("\ttokens=");
390 			for (int i = 0; i < this.tokens.size(); ++i) {
391 				out.append(i == 0 ? "" : ",");
392 				this.tokens.get(i).toString(out);
393 			}
394 			final Map<String, String> properties = getProperties();
395 			for (final String name : Ordering.natural().immutableSortedCopy(properties.keySet())) {
396 				final String value = properties.get(name);
397 				out.append('\t').append(name).append('=').append(value);
398 			}
399 			return out;
400 		}
401 
402 		@Override
403 		public final String toString() {
404 			try {
405 				return toString(new StringBuilder()).toString();
406 			} catch (final IOException ex) {
407 				throw new Error(ex);
408 			}
409 		}
410 
411 	}
412 
413 	public static final class Token {
414 
415 		public static final Token WILDCARD = new Token(null, null, null);
416 
417 		@Nullable
418 		private final String lemma;
419 
420 		@Nullable
421 		private final String stem;
422 
423 		@Nullable
424 		private final String pos;
425 
426 		private Token(@Nullable final String lemma, @Nullable final String stem,
427 					  @Nullable final String pos) {
428 			this.lemma = lemma == null ? null : lemma.toLowerCase().intern();
429 			this.stem = stem == null ? null : stem.toLowerCase().intern();
430 			this.pos = pos == null ? null : pos.toUpperCase().intern();
431 		}
432 
433 		public static Token create(@Nullable final String lemma, @Nullable final String stem,
434 								   @Nullable final String pos) {
435 			return lemma == null && stem == null && pos == null ? WILDCARD //
436 					: new Token(lemma, stem, pos);
437 		}
438 
439 		public static Token parse(final String string) {
440 			return parse(string, null);
441 		}
442 
443 		public static Token parse(final String string, @Nullable String altLemma) {
444 			final String[] fields = string.split("\\|");
445 			String lemma = fields[0].trim();
446 			final String stem = fields[1].trim();
447 			final String pos = fields[2].trim();
448 			if (".".equals(lemma)) {
449 				lemma = altLemma;
450 			}
451 			return create("*".equals(lemma) ? null : lemma, "*".equals(stem) ? null : stem,
452 					"*".equals(pos) ? null : pos);
453 		}
454 
455 		@Nullable
456 		public String getLemma() {
457 			return this.lemma;
458 		}
459 
460 		@Nullable
461 		public String getStem() {
462 			return this.stem;
463 		}
464 
465 		@Nullable
466 		public String getPos() {
467 			return this.pos;
468 		}
469 
470 		public boolean match(@Nullable final Term term) {
471 			return term != null
472 					&& (this.pos == null || this.pos.equalsIgnoreCase(term.getPos()) || this.pos
473 					.equals(term.getMorphofeat()))
474 					&& (this.lemma == null || this.lemma.equalsIgnoreCase(term.getLemma()))
475 					&& (this.stem == null || this.stem.equalsIgnoreCase(Stemming.stem(null,
476 					term.getStr())));
477 		}
478 
479 		@Override
480 		public boolean equals(final Object object) {
481 			if (object == this) {
482 				return true;
483 			}
484 			if (!(object instanceof Token)) {
485 				return false;
486 			}
487 			final Token other = (Token) object;
488 			return Objects.equal(this.lemma, other.lemma) && Objects.equal(this.stem, other.stem)
489 					&& Objects.equal(this.pos, other.pos);
490 		}
491 
492 		@Override
493 		public int hashCode() {
494 			return Objects.hashCode(this.lemma, this.stem, this.pos);
495 		}
496 
497 		public <T extends Appendable> T toString(final T out) throws IOException {
498 			out.append(Objects.firstNonNull(this.lemma, "*"));
499 			out.append("|");
500 			out.append(Objects.firstNonNull(this.stem, "*"));
501 			out.append("|");
502 			out.append(Objects.firstNonNull(this.pos, "*"));
503 			return out;
504 		}
505 
506 		@Override
507 		public String toString() {
508 			try {
509 				return toString(new StringBuilder()).toString();
510 			} catch (final IOException ex) {
511 				throw new Error(ex);
512 			}
513 		}
514 
515 	}
516 
517 }