1   package eu.fbk.dkm.pikes.raid;
2   
3   import com.google.common.collect.HashMultimap;
4   import com.google.common.collect.Multimap;
5   import eu.fbk.dkm.pikes.naflib.Corpus;
6   import eu.fbk.dkm.pikes.resources.*;
7   import eu.fbk.utils.analysis.stemmer.Stemmer;
8   import eu.fbk.utils.analysis.stemmer.StemmerFactory;
9   import eu.fbk.utils.core.ArrayUtils;
10  import eu.fbk.utils.core.CommandLine;
11  import ixa.kaflib.*;
12  import org.slf4j.LoggerFactory;
13  
14  import javax.annotation.Nullable;
15  import java.io.*;
16  import java.nio.file.Path;
17  import java.util.*;
18  
19  /**
20   * Created by alessio on 17/04/15.
21   */
22  
public class CreateTrainingForExpression {

	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CreateTrainingForExpression.class);
	// Opinion-layer label used when no "-l" option is supplied on the command line.
	private static final String DEFAULT_LABEL = "gold";

	// Lexical resources; loaded once in main() before any call to extractFeats().
	private static SenticNet senticNet;
	private static SubjectivityLexicon subjectivityLexicon;
	private static Stemmer stemmer;
	private static Intensities intensities;

	// Families of token features this extractor can emit (see the FEATS_* flags below).
	public enum Features {
		STANFORD,
		SENTICNET,
		SUBJLEXICON,
		INTENSITY,
		WORDNET,
		SENTIWORDNET,
		MOSCHITTI,
		SST,
		ENTITIES,
		STEM,
		POS,
		DEP,
		SRL
	}

	// Master switch: several feature families below are enabled only when this is true.
	private static boolean MAJORITY = false;

	// Per-family feature toggles. Note: these are compile-time defaults, but FEATS_SRL,
	// FEATS_WORDNET and FEATS_SENTIWORDNET are forced off at runtime for YAMCHA output.
	private static boolean FEATS_STANFORD = false;
	private static boolean FEATS_SENTICNET = MAJORITY;
	private static boolean FEATS_SUBJLEXICON = MAJORITY;
	private static boolean FEATS_INTENSITY = MAJORITY;
	private static boolean FEATS_WORDNET = MAJORITY;
	private static boolean FEATS_SENTIWORDNET = false;

	private static boolean FEATS_MOSCHITTI = true;

	private static boolean FEATS_SST = true;
	private static boolean FEATS_ENTITIES = true;
	private static boolean FEATS_STEM = true;
	private static boolean FEATS_POS = true;
	private static boolean FEATS_DEP = MAJORITY;
	private static boolean FEATS_SRL = MAJORITY;

	// Defaults for the command-line options (seed, class column name, window size, split).
	private static Long DEFAULT_SEED = 2l;
	private static String DEFAULT_CLASSIFICATION_LABEL = "_CLASS";
	private static Integer DEFAULT_SLOT_SIZE = 1;
	private static Float DEFAULT_SPLIT = 0.75f;

	// Placeholder values used in the emitted feature columns: "-" = absent, "Y" = present.
	private static String DEFAULT_NONE = "-";
	private static String DEFAULT_YES = "Y";

	// Columns for which the CRF template additionally emits bigram (offset-pair) features.
	private static HashSet<String> DOUBLE_FEATURES = new HashSet<>();

	static {
		DOUBLE_FEATURES.add("LEMMA");
		DOUBLE_FEATURES.add("P");
		DOUBLE_FEATURES.add("E");
		DOUBLE_FEATURES.add("SST");
	}

	// Columns for which the CRF template additionally emits trigram (offset-triple) features.
	private static HashSet<String> TRIPLE_FEATURES = new HashSet<>();

	static {
		TRIPLE_FEATURES.add("LEMMA");
		TRIPLE_FEATURES.add("P");
	}

	// Supported output toolkits.
	public enum Type {
		MALLET, MALLET_WINDOW, YAMCHA, CRFSUITE, WAPITI
	}

	// SINGLE: features of the current token only; COMPLETE: features over a +/- slot window.
	public enum OutputType {
		SINGLE, COMPLETE
	}

	static Type DEFAULT_TYPE = Type.CRFSUITE;
100 
101 	public static ArrayList<ArrayList<LinkedHashMap<String, String>>> extractFeats(KAFDocument document, String[] labels, Set<String> hypernyms, boolean skipEmpty) {
102 		HashSet<Term> opinionTerms = new HashSet<>();
103 		HashMap<Term, String> stanfordTerms = new HashMap<>();
104 		HashMap<Term, String> entityTerms = new HashMap<>();
105 
106 		// Preprocessing srl
107 		HashMultimap<Term, String> srlFeatures = HashMultimap.create();
108 		String featName;
109 		if (FEATS_SRL) {
110 			for (Predicate predicate : document.getPredicates()) {
111 				for (Term term : predicate.getTerms()) {
112 					srlFeatures.put(term, "isPredicate");
113 					for (ExternalRef externalRef : predicate.getExternalRefs()) {
114 						if (externalRef.getReference().length() == 0) {
115 							continue;
116 						}
117 						featName = "isPredicate." + externalRef.getResource() + "." + externalRef.getReference();
118 						srlFeatures.put(term, featName);
119 					}
120 					for (Predicate.Role role : predicate.getRoles()) {
121 						for (ExternalRef externalRef : role.getExternalRefs()) {
122 							if (externalRef.getReference().length() == 0) {
123 								continue;
124 							}
125 //							featName = "hasRole." + externalRef.getResource() + "." + externalRef.getReference();
126 //							srlFeatures.put(term, featName);
127 							featName = "hasRole." + externalRef.getReference();
128 							srlFeatures.put(term, featName);
129 
130 							for (Term roleTerm : role.getTerms()) {
131 								featName = "isRole";
132 								srlFeatures.put(roleTerm, featName);
133 //								featName = "isRole." + externalRef.getResource() + "." + externalRef.getReference();
134 //								srlFeatures.put(roleTerm, featName);
135 
136 								for (ExternalRef roleExternalRef : predicate.getExternalRefs()) {
137 									if (roleExternalRef.getReference().length() == 0) {
138 										continue;
139 									}
140 									featName = "isRoleFor." + roleExternalRef.getReference();
141 									srlFeatures.put(term, featName);
142 //									featName = "isRoleFor." + roleExternalRef.getResource() + "." + roleExternalRef.getReference();
143 //									srlFeatures.put(term, featName);
144 								}
145 							}
146 
147 						}
148 					}
149 
150 				}
151 
152 			}
153 		}
154 
155 		// Preprocessing entities
156 		for (Entity entity : document.getEntities()) {
157 			for (Term term : entity.getTerms()) {
158 				entityTerms.put(term, entity.getType());
159 			}
160 		}
161 
162 		// Preprocessing opinions
163 		for (Opinion opinion : document.getOpinions()) {
164 			if (opinion.getOpinionExpression() == null) {
165 				continue;
166 			}
167 
168 			if (opinion.getLabel() == null) {
169 				continue;
170 			}
171 
172 			if (opinion.getOpinionExpression().getSpan() == null) {
173 				continue;
174 			}
175 
176 			boolean hasLabel = false;
177 			for (String label : labels) {
178 				if (opinion.getLabel().contains(label)) {
179 					hasLabel = true;
180 					break;
181 				}
182 			}
183 
184 			if (!hasLabel) {
185 				if (opinion.getLabel().equals("stanford-sentiment")) {
186 					if (opinion.getOpinionExpression().getSpan().size() == 1) {
187 						String pol = opinion.getOpinionExpression().getPolarity();
188 						if (pol.equals("Neutral")) {
189 							pol = "M";
190 						}
191 						stanfordTerms.put(opinion.getOpinionExpression().getSpan().getFirstTarget(), pol);
192 					}
193 				}
194 				continue;
195 			}
196 
197 			// for VUA dataset
198 			if (opinion.getOpinionExpression().getPolarity() != null) {
199 				if (opinion.getOpinionExpression().getPolarity().equals("NON-OPINIONATED")) {
200 					continue;
201 				}
202 			}
203 
204 //			boolean first = true;
205 			for (Term term : opinion.getOpinionExpression().getSpan().getTargets()) {
206 //				if (first) {
207 //					firstTerms.add(term);
208 //					first = false;
209 //				}
210 				opinionTerms.add(term);
211 			}
212 		}
213 
214 		Multimap<Term, SenticNet.Lexeme> senticnetMM = senticNet.match(document, document.getTerms());
215 		Multimap<Term, SubjectivityLexicon.Lexeme> subjectivityMM = subjectivityLexicon.match(document, document.getTerms());
216 		Multimap<Term, Intensities.Lexeme> intensitiesMM = intensities.match(document, document.getTerms());
217 
218 //		StringBuffer buffer = new StringBuffer();
219 		ArrayList<ArrayList<LinkedHashMap<String, String>>> ret = new ArrayList<>();
220 
221 		for (int i = 0; i < document.getNumSentences(); i++) {
222 
223 			ArrayList<LinkedHashMap<String, String>> sentence = new ArrayList<>();
224 			int sent = i + 1;
225 			String last = "O";
226 			for (Term term : document.getSentenceTerms(sent)) {
227 				LinkedHashMap<String, String> feats = new LinkedHashMap<>();
228 
229 				feats.put("TERM", term.getForm());
230 				feats.put("LEMMA", term.getLemma());
231 				if (FEATS_STEM) {
232 					feats.put("STEM", stemmer.stem(term.getLemma()));
233 				}
234 				if (FEATS_POS) {
235 					feats.put("P", term.getPos());
236 				}
237 				feats.put("M", term.getMorphofeat());
238 
239 				if (FEATS_DEP) {
240 					Dep to = document.getDepToTerm(term);
241 					feats.put("DEP.R", DEFAULT_NONE);
242 					feats.put("DEP.L", DEFAULT_NONE);
243 					feats.put("DEP.M", DEFAULT_NONE);
244 					feats.put("DEP.P", DEFAULT_NONE);
245 					if (to != null) {
246 						feats.put("DEP.R", to.getRfunc());
247 						feats.put("DEP.L", to.getRfunc() + "." + to.getFrom().getLemma());
248 						feats.put("DEP.M", to.getRfunc() + "." + to.getFrom().getMorphofeat());
249 						feats.put("DEP.P", to.getRfunc() + "." + to.getFrom().getPos());
250 					}
251 				}
252 
253 				if (FEATS_SRL) {
254 					for (String s : srlFeatures.get(term)) {
255 						feats.put("SRL." + s, DEFAULT_YES);
256 					}
257 				}
258 
259 				if (FEATS_ENTITIES) {
260 					String entity = entityTerms.get(term);
261 					if (entity == null) {
262 						entity = DEFAULT_NONE;
263 					}
264 					if (!skipEmpty || !entity.equals(DEFAULT_NONE)) {
265 						feats.put("E", entity);
266 					}
267 				}
268 
269 				if (FEATS_SST) {
270 					String SST = DEFAULT_NONE;
271 					for (ExternalRef externalRef : term.getExternalRefs()) {
272 						if (externalRef.getResource().equals("wn30-sst")) {
273 							SST = externalRef.getReference();
274 							break;
275 						}
276 					}
277 					if (!skipEmpty || !SST.equals(DEFAULT_NONE)) {
278 						feats.put("SST", SST);
279 					}
280 				}
281 
282 				if (FEATS_SENTICNET) {
283 					Collection<SenticNet.Lexeme> snLexemes = senticnetMM.get(term);
284 					String isInSenticNet = DEFAULT_NONE;
285 					String bigAptitude = DEFAULT_NONE;
286 					String bigAttention = DEFAULT_NONE;
287 					String bigPleasentness = DEFAULT_NONE;
288 					String bigPolarity = DEFAULT_NONE;
289 					String bigSensitivity = DEFAULT_NONE;
290 					if (snLexemes.size() > 0) {
291 						isInSenticNet = DEFAULT_YES;
292 						for (SenticNet.Lexeme lexeme : snLexemes) {
293 							bigAptitude = lexeme.getAptitude() > 0.5 ? DEFAULT_YES : DEFAULT_NONE;
294 							bigAttention = lexeme.getAttention() > 0.5 ? DEFAULT_YES : DEFAULT_NONE;
295 							bigPleasentness = lexeme.getPleasentness() > 0.5 ? DEFAULT_YES : DEFAULT_NONE;
296 							bigPolarity = lexeme.getPolarity() > 0.5 ? DEFAULT_YES : DEFAULT_NONE;
297 							bigSensitivity = lexeme.getSensitivity() > 0.5 ? DEFAULT_YES : DEFAULT_NONE;
298 							break;
299 						}
300 					}
301 					if (!skipEmpty || !isInSenticNet.equals(DEFAULT_NONE)) {
302 						feats.put("SNi", isInSenticNet);
303 					}
304 					if (!skipEmpty || !bigAptitude.equals(DEFAULT_NONE)) {
305 						feats.put("SNa", bigAptitude);
306 					}
307 					if (!skipEmpty || !bigAttention.equals(DEFAULT_NONE)) {
308 						feats.put("SNt", bigAttention);
309 					}
310 					if (!skipEmpty || !bigPleasentness.equals(DEFAULT_NONE)) {
311 						feats.put("SNl", bigPleasentness);
312 					}
313 					if (!skipEmpty || !bigPolarity.equals(DEFAULT_NONE)) {
314 						feats.put("SNp", bigPolarity);
315 					}
316 					if (!skipEmpty || !bigSensitivity.equals(DEFAULT_NONE)) {
317 						feats.put("SNs", bigSensitivity);
318 					}
319 				}
320 
321 				Collection<SubjectivityLexicon.Lexeme> slLexemes = null;
322 				if (FEATS_MOSCHITTI || FEATS_SUBJLEXICON) {
323 					slLexemes = subjectivityMM.get(term);
324 				}
325 
326 				if (FEATS_MOSCHITTI) {
327 					String subjLexM = DEFAULT_NONE;
328 					if (slLexemes.size() > 0) {
329 						String level = "weak";
330 						String pol = "neu";
331 						for (SubjectivityLexicon.Lexeme lexeme : slLexemes) {
332 							if (lexeme.isStrong()) {
333 								level = "str";
334 							}
335 							pol = lexeme.getPolarity().toString().substring(0, 3).toLowerCase();
336 							break;
337 						}
338 						subjLexM = level + "/" + pol;
339 					}
340 					if (!skipEmpty || !subjLexM.equals(DEFAULT_NONE)) {
341 						feats.put("MOSCHITTI", subjLexM);
342 					}
343 				}
344 
345 				if (FEATS_SUBJLEXICON) {
346 					String isInSubjLex = DEFAULT_NONE;
347 					String subjLexM = DEFAULT_NONE;
348 					String isInSubjLexStrong = DEFAULT_NONE;
349 					if (slLexemes.size() > 0) {
350 						isInSubjLex = DEFAULT_YES;
351 						for (SubjectivityLexicon.Lexeme lexeme : slLexemes) {
352 							if (lexeme.isStrong()) {
353 								isInSubjLexStrong = DEFAULT_YES;
354 							}
355 							subjLexM = lexeme.getPolarity().toString() + "." + isInSubjLexStrong;
356 							break;
357 						}
358 					}
359 					if (!skipEmpty || !isInSubjLex.equals(DEFAULT_NONE)) {
360 						feats.put("SLi", isInSubjLex);
361 					}
362 					if (!skipEmpty || !isInSubjLexStrong.equals(DEFAULT_NONE)) {
363 						feats.put("SLs", isInSubjLexStrong);
364 					}
365 					if (!skipEmpty || !subjLexM.equals(DEFAULT_NONE)) {
366 						feats.put("SLm", subjLexM);
367 					}
368 				}
369 
370 				if (FEATS_INTENSITY) {
371 					for (Intensities.Type type : Intensities.Type.values()) {
372 						String typeStr = DEFAULT_NONE;
373 						if (intensitiesMM.get(term).size() > 0) {
374 							for (Intensities.Lexeme lexeme : intensitiesMM.get(term)) {
375 								if (lexeme.getType().equals(type)) {
376 //								System.out.println(lexeme);
377 									typeStr = DEFAULT_YES;
378 								}
379 							}
380 						}
381 						char first = type.toString().charAt(0);
382 						if (!skipEmpty || !typeStr.equals(DEFAULT_NONE)) {
383 							feats.put("IN" + first, typeStr);
384 						}
385 					}
386 				}
387 
388 				if (FEATS_STANFORD) {
389 					String stanfordLabel = "M";
390 					if (stanfordTerms.containsKey(term)) {
391 						stanfordLabel = stanfordTerms.get(term);
392 					}
393 					String[] split = stanfordLabel.split("(?<=[\\S])[\\S]*\\s*");
394 					stanfordLabel = ArrayUtils.implode("", split);
395 					feats.put("STF", stanfordLabel);
396 				}
397 
398 				String wnSense = getWnFromTerm(term);
399 
400 				if (FEATS_WORDNET) {
401 					Set<String> termHypernyms = new HashSet<>();
402 					if (wnSense != null) {
403 						termHypernyms = WordNet.getHypernyms(wnSense, true);
404 					}
405 					if (hypernyms.size() > 0) {
406 						for (String hypernym : hypernyms) {
407 							if (termHypernyms.contains(hypernym)) {
408 								feats.put("WN." + hypernym, DEFAULT_YES);
409 							}
410 							else {
411 								if (!skipEmpty) {
412 									feats.put("WN." + hypernym, DEFAULT_NONE);
413 								}
414 							}
415 						}
416 					}
417 					else {
418 						for (String hypernym : termHypernyms) {
419 							feats.put("WN." + hypernym, DEFAULT_YES);
420 						}
421 					}
422 				}
423 
424 				if (FEATS_SENTIWORDNET) {
425 					if (!skipEmpty) {
426 						feats.put("SWN+", DEFAULT_NONE);
427 						feats.put("SWN-", DEFAULT_NONE);
428 					}
429 					if (wnSense != null) {
430 						PosNegPair swnPair = SentiWordNet.searchValue(wnSense);
431 						int posTimes = (int) Math.round(swnPair.getPosScore() / .125);
432 						int negTimes = (int) Math.round(swnPair.getNegScore() / .125);
433 						if (posTimes > 0) {
434 							feats.put("SWN+", Integer.toString(posTimes));
435 						}
436 						if (negTimes > 0) {
437 							feats.put("SWN-", Integer.toString(negTimes));
438 						}
439 					}
440 				}
441 
442 				if (opinionTerms.contains(term)) {
443 					if (last.equals("O")) {
444 						last = "B-t";
445 					}
446 					else {
447 						last = "I-t";
448 					}
449 				}
450 				else {
451 					last = "O";
452 				}
453 				feats.put(DEFAULT_CLASSIFICATION_LABEL, last);
454 
455 				sentence.add(feats);
456 			}
457 			ret.add(sentence);
458 		}
459 
460 		return ret;
461 	}
462 
	/**
	 * Command-line entry point: parses options, loads the lexical resources, splits the
	 * corpus into train/test (either via explicit file lists or a random seeded split),
	 * writes data.train / data.test, and — for YAMCHA output — a CRF template file.
	 */
	public static void main(String[] args) {
		try {
			final CommandLine cmd = CommandLine
					.parser()
					.withName("yamcha-extractor")
					.withHeader("Extract YAMCHA training set")
					.withOption("i", "input-folder", "the folder of the corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
					.withOption("w", "wordnet-path", "WordNet dict folder", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
					.withOption("s", "sentiwordnet-path", "SentiWordNet file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
					.withOption("o", "output-folder", "output folder", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
					.withOption("l", "label", "label(s), in comma separated format", "LABEL", CommandLine.Type.STRING, true, false, false)
					.withOption("t", "type", String.format("Output type, default %s", DEFAULT_TYPE), "TYPE", CommandLine.Type.STRING, true, false, false)
//					.withOption("e", "extensions", String.format("Input extensions (default %s)", CorpusAnnotator.DEFAULT_NAF_EXTENSIONS), "EXTS", CommandLine.Type.STRING, true, true, false)
					// NOTE(review): declared as Type.FLOAT but read below with Long.class —
					// confirm the CommandLine library coerces this correctly.
					.withOption(null, "seed", "Seed", "NUM", CommandLine.Type.FLOAT, true, false, false)
					.withOption(null, "slot", String.format("Slot size, default %d", DEFAULT_SLOT_SIZE), "NUM", CommandLine.Type.NON_NEGATIVE_INTEGER, true, false, false)
					.withOption(null, "split", "Split part (training)", "NUM", CommandLine.Type.POSITIVE_FLOAT, true, false, false)
					.withOption(null, "skip-empty-train", "Skip empty sentences in training")
					.withOption(null, "skip-empty-test", "Skip empty sentences in test")
					// NOTE(review): typo "Trining" in the user-visible help text below.
					.withOption(null, "train-list", "Trining set file list", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
					.withOption(null, "test-list", "Test set file list", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
					.withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);

			File mainFolder = cmd.getOptionValue("i", File.class);
			File outputFolder = cmd.getOptionValue("o", File.class);

			File wnFolder = cmd.getOptionValue("w", File.class);
			File swnFolder = cmd.getOptionValue("s", File.class);

			// Multiple labels may be given comma-separated; matching is by substring.
			String label = cmd.getOptionValue("l", String.class, DEFAULT_LABEL);
			String[] labels = label.split(",");

			boolean skipEmptyTrain = cmd.hasOption("skip-empty-train");
			boolean skipEmptyTest = cmd.hasOption("skip-empty-test");

			Type type = DEFAULT_TYPE;

			String typeString = cmd.getOptionValue("type", String.class);
			if (typeString != null) {
				try {
					type = Type.valueOf(typeString.toUpperCase());
				} catch (Exception e) {
					throw new CommandLine.Exception(e.getMessage(), e);
				}
			}

			// YAMCHA output cannot host the open-ended SRL/WordNet feature columns.
			if (type.equals(Type.YAMCHA)) {
				FEATS_SRL = false;
				FEATS_WORDNET = false;
				FEATS_SENTIWORDNET = false;
			}

			Integer slotSize = cmd.getOptionValue("slot", Integer.class, DEFAULT_SLOT_SIZE);
			Float split = cmd.getOptionValue("split", Float.class, DEFAULT_SPLIT);

//			char space = mallet ? ' ' : '\t';

			Long seed = cmd.getOptionValue("seed", Long.class, DEFAULT_SEED);

//			List<String> extensions = null;
//			if (cmd.hasOption("e")) {
//				extensions = cmd.getOptionValues("e", String.class);
//			}
//			if (extensions == null) {
//				extensions = CorpusAnnotator.DEFAULT_NAF_EXTENSIONS;
//			}

			File trainList = cmd.getOptionValue("train-list", File.class);
			File testList = cmd.getOptionValue("test-list", File.class);

			// Explicit file lists must be given in pairs (or not at all).
			if ((trainList != null && testList == null) || (testList != null && trainList == null)) {
				throw new Exception("Train list and test list must be both declared or both missing");
			}

			// ---

			if (!outputFolder.exists()) {
				boolean createdOutputFolder = outputFolder.mkdirs();
				if (!createdOutputFolder) {
					LOGGER.error("Unable to create {}", outputFolder.getAbsolutePath());
					System.exit(1);
				}
			}

			LOGGER.info("Loading resources");
			senticNet = SenticNet.getInstance();
			subjectivityLexicon = SubjectivityLexicon.getInstance();
			stemmer = StemmerFactory.getInstance(Locale.US);
			intensities = Intensities.getInstance();

			if (wnFolder != null) {
				WordNet.setPath(wnFolder.getAbsolutePath());
				WordNet.init();
			}

			if (swnFolder != null) {
				SentiWordNet.setPath(swnFolder);
				SentiWordNet.init();
			}

			LOGGER.info("Parsing corpus");
			// corpuses[0] = training split, corpuses[1] = test split.
			Corpus[] corpuses = new Corpus[2];
			if (trainList != null) {
				List<File> trainFiles = readList(trainList, mainFolder, "naf");
				List<File> testFiles = readList(testList, mainFolder, "naf");
				corpuses[0] = Corpus.create(false, trainFiles);
				corpuses[1] = Corpus.create(false, testFiles);
			}
			else {
				Corpus myCorpus = Corpus.create(false, mainFolder);
				corpuses = myCorpus.split(seed, split, 1.0f - split);
			}

			// WordNet
			Set<String> allHypernyms = new TreeSet<>();

			// Populate columns (YAMCHA only): the template generator below needs a fixed,
			// ordered column inventory, taken from the first non-empty sentence.
			ArrayList<String> columns = new ArrayList<>();
			if (type.equals(Type.YAMCHA)) {
				if (wnFolder != null) {
					LOGGER.info("Collecting WordNet information");
					// Collect the hypernyms of every sense in both splits so the WN.*
					// columns are identical for train and test.
					for (int i = 0; i < 2; i++) {
						for (Path file : corpuses[i].files()) {
							KAFDocument document = corpuses[i].get(file);
							for (Term term : document.getTerms()) {
								String wnSense = getWnFromTerm(term);
								if (wnSense != null && wnSense.length() > 0) {
									Set<String> hypernyms = WordNet.getHypernyms(wnSense, true);
									allHypernyms.addAll(hypernyms);
								}
							}
						}
					}
					LOGGER.info("Loaded {} hypernyms", allHypernyms.size());
				}
				for (Path file : corpuses[0].files()) {
					KAFDocument document = corpuses[0].get(file);
					ArrayList<ArrayList<LinkedHashMap<String, String>>> sentences = extractFeats(document, labels, allHypernyms, false);
					if (columns.size() == 0 && sentences.size() > 0 && sentences.get(0).size() > 0) {
						for (String key : sentences.get(0).get(0).keySet()) {
							if (!key.equals(DEFAULT_CLASSIFICATION_LABEL)) {
								columns.add(key);
							}
						}
						break;
					}
				}
			}

			// Train
			LOGGER.info("Loading training data");
			File trainDataFile = new File(outputFolder.getAbsolutePath() + File.separator + "data.train");
			// NOTE(review): writer is not closed if writeFeats throws — consider
			// try-with-resources here and for the test/template writers below.
			BufferedWriter trainWriter = new BufferedWriter(new FileWriter(trainDataFile));
			for (Path file : corpuses[0].files()) {
				KAFDocument document = corpuses[0].get(file);
				writeFeats(document, trainWriter, labels, skipEmptyTrain, allHypernyms, type, slotSize);
			}
			trainWriter.close();

			// Test
			LOGGER.info("Loading test data");
			File testDataFile = new File(outputFolder.getAbsolutePath() + File.separator + "data.test");
			BufferedWriter testWriter = new BufferedWriter(new FileWriter(testDataFile));
			for (Path file : corpuses[1].files()) {
				KAFDocument document = corpuses[1].get(file);
				writeFeats(document, testWriter, labels, skipEmptyTest, allHypernyms, type, slotSize);
			}
			testWriter.close();

			// CRF template (YAMCHA only): one unigram macro block per column, plus
			// bigram/trigram combinations for the whitelisted columns.
			if (type.equals(Type.YAMCHA)) {
				File templateFile = new File(outputFolder.getAbsolutePath() + File.separator + "template.crf");
				BufferedWriter templateWriter = new BufferedWriter(new FileWriter(templateFile));
				StringBuffer buffer = new StringBuffer();

				int featNo = 0;
				for (int i = 0; i < columns.size(); i++) {
					String colName = columns.get(i);

					if (colName.equals(DEFAULT_CLASSIFICATION_LABEL)) {
						continue;
					}

					buffer.append("#").append(colName).append("\n");

					// WN.* columns are binary and only looked up at the current token;
					// every other column gets the full +/- slotSize window.
					if (!colName.startsWith("WN")) {
						for (int offset = -slotSize; offset <= slotSize; offset++) {
							buffer.append("U").append(++featNo).append(":")
									.append("%x[").append(offset).append(",").append(i).append("]")
									.append("\n");
						}
					}
					else {
						buffer.append("U").append(++featNo).append(":")
								.append("%x[").append("0").append(",").append(i).append("]")
								.append("\n");
					}

					if (DOUBLE_FEATURES.contains(colName)) {
						for (int offset = -slotSize; offset <= slotSize - 1; offset++) {
							buffer.append("U").append(++featNo).append(":")
									.append("%x[").append(offset).append(",").append(i).append("]")
									.append("/")
									.append("%x[").append(offset + 1).append(",").append(i).append("]")
									.append("\n");
						}
					}

					if (TRIPLE_FEATURES.contains(colName)) {
						for (int offset = -slotSize; offset <= slotSize - 2; offset++) {
							buffer.append("U").append(++featNo).append(":")
									.append("%x[").append(offset).append(",").append(i).append("]")
									.append("/")
									.append("%x[").append(offset + 1).append(",").append(i).append("]")
									.append("/")
									.append("%x[").append(offset + 2).append(",").append(i).append("]")
									.append("\n");
						}
					}

					buffer.append("\n");
				}

				buffer.append("#BIGRAMS\n");
				buffer.append("B").append("\n");

				templateWriter.write(buffer.toString());
				templateWriter.close();
			}

			LOGGER.debug(columns.toString());

		} catch (final Throwable ex) {
			CommandLine.fail(ex);
		}
	}
697 
698 	private static String getWnFromTerm(Term term) {
699 		String wnSense = term.getWordnetSense();
700 		if (wnSense == null || wnSense.length() == 0) {
701 			for (ExternalRef externalRef : term.getExternalRefs()) {
702 				if (externalRef.getResource().equals("wn30-ukb")) {
703 					wnSense = externalRef.getReference();
704 					if (wnSense != null && wnSense.length() > 0) {
705 						break;
706 					}
707 				}
708 			}
709 		}
710 
711 		return wnSense;
712 	}
713 
714 	public static List<File> readList(File fileList, File baseFolder, @Nullable String replaceExtension) throws IOException {
715 
716 		List<File> ret = new ArrayList<>();
717 
718 		BufferedReader reader = null;
719 		try {
720 			reader = new BufferedReader(new FileReader(fileList));
721 
722 			String line;
723 			while ((line = reader.readLine()) != null) {
724 				line = line.trim();
725 				if (line.length() == 0) {
726 					continue;
727 				}
728 
729 				String fileName = baseFolder.getAbsolutePath() + File.separator + line;
730 
731 				if (replaceExtension != null) {
732 					fileName = fileName.replaceAll("\\.[^\\.]+$", "." + replaceExtension);
733 				}
734 
735 				File file = new File(fileName);
736 				if (!file.exists()) {
737 					LOGGER.warn("File {} does not exist", fileName);
738 					continue;
739 				}
740 
741 				ret.add(file);
742 			}
743 		} catch (Exception e) {
744 			LOGGER.error(e.getMessage());
745 		} finally {
746 			if (reader != null) {
747 				reader.close();
748 			}
749 		}
750 		return ret;
751 	}
752 
753 	private static void writeFeats(KAFDocument document, BufferedWriter writer, String[] labels, boolean skipEmptySentences, Set<String> hypernyms, Type type, int slotSize) throws IOException {
754 
755 		char space = '\t';
756 		OutputType outputType = OutputType.SINGLE;
757 		boolean classBefore = false;
758 		boolean skipEmptyFeatures = true;
759 		String featurePrefix = "";
760 
761 		switch (type) {
762 			case MALLET:
763 				space = ' ';
764 				break;
765 			case MALLET_WINDOW:
766 				space = ' ';
767 				outputType = OutputType.COMPLETE;
768 				break;
769 			case CRFSUITE:
770 				outputType = OutputType.COMPLETE;
771 				classBefore = true;
772 				break;
773 			case YAMCHA:
774 				skipEmptyFeatures = false;
775 				break;
776 			case WAPITI:
777 				outputType = OutputType.COMPLETE;
778 				featurePrefix = "u:";
779 				break;
780 		}
781 
782 		ArrayList<ArrayList<LinkedHashMap<String, String>>> sentences = extractFeats(document, labels, hypernyms, false);
783 
784 		String string1 = "";
785 		String string2 = "";
786 		if (classBefore) {
787 			string1 = Character.toString(space);
788 		}
789 		else {
790 			string2 = Character.toString(space);
791 		}
792 
793 		string1 += featurePrefix;
794 
795 		StringBuffer bigBuffer = new StringBuffer();
796 
797 		for (ArrayList<LinkedHashMap<String, String>> sentence : sentences) {
798 
799 			boolean isAnnotated = false;
800 			StringBuffer buffer = new StringBuffer();
801 
802 			for (int i = 0; i < sentence.size(); i++) {
803 				LinkedHashMap<String, String> token = sentence.get(i);
804 				String classification = token.get(DEFAULT_CLASSIFICATION_LABEL);
805 
806 				if (classBefore) {
807 					buffer.append(classification);
808 				}
809 
810 				switch (outputType) {
811 					case SINGLE:
812 						// Features
813 						for (String key : token.keySet()) {
814 							if (key.equals(DEFAULT_CLASSIFICATION_LABEL)) {
815 								continue;
816 							}
817 
818 							String value = token.get(key);
819 
820 							if (key.startsWith("WN")) {
821 								buffer.append(string1).append(key).append(string2);
822 							}
823 							else {
824 								if (!skipEmptyFeatures || !value.equals(DEFAULT_NONE)) {
825 									buffer.append(string1).append(key).append(".").append(value).append(string2);
826 								}
827 							}
828 						}
829 
830 						break;
831 					case COMPLETE:
832 
833 						// Sentence features
834 						if (i == 0) {
835 							buffer.append(string1).append("BOS").append(string2);
836 						}
837 
838 						// Other features
839 						for (String key : token.keySet()) {
840 							if (key.equals(DEFAULT_CLASSIFICATION_LABEL)) {
841 								continue;
842 							}
843 
844 //							String value = token.get(key);
845 
846 							if (key.startsWith("WN")) {
847 								buffer.append(string1).append(key).append(string2);
848 							}
849 							else {
850 								for (int offset = -slotSize; offset <= slotSize; offset++) {
851 									LinkedHashMap<String, String> thisToken;
852 									try {
853 										thisToken = sentence.get(i + offset);
854 									} catch (IndexOutOfBoundsException e) {
855 										continue;
856 									}
857 
858 									String thisValue = thisToken.get(key);
859 									if (thisValue == null) {
860 										continue;
861 									}
862 									if (!skipEmptyFeatures || !thisValue.equals(DEFAULT_NONE)) {
863 										buffer.append(string1)
864 												.append("[").append(offset).append("]")
865 												.append(key).append(".").append(thisValue)
866 												.append(string2);
867 									}
868 								}
869 							}
870 						}
871 
872 						// Sentence features
873 						if (i == sentence.size() - 1) {
874 							buffer.append(string1).append("EOS").append(string2);
875 						}
876 
877 						break;
878 				}
879 
880 				if (!classBefore) {
881 					buffer.append(classification);
882 				}
883 
884 				buffer.append("\n");
885 			}
886 			buffer.append("\n");
887 			if (!skipEmptySentences || isAnnotated) {
888 				bigBuffer.append(buffer.toString());
889 			}
890 		}
891 
892 		writer.write(bigBuffer.toString());
893 	}
894 }