1 package eu.fbk.dkm.pikes.raid;
2
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import eu.fbk.dkm.pikes.naflib.Corpus;
import eu.fbk.dkm.pikes.resources.*;
import eu.fbk.utils.analysis.stemmer.Stemmer;
import eu.fbk.utils.analysis.stemmer.StemmerFactory;
import eu.fbk.utils.core.ArrayUtils;
import eu.fbk.utils.core.CommandLine;
import ixa.kaflib.*;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
18
19
20
21
22
23 public class CreateTrainingForExpression {
24
    private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CreateTrainingForExpression.class);
    // Opinion-layer label assumed when no -l/--label option is given on the command line.
    private static final String DEFAULT_LABEL = "gold";

    // Lexical resources; initialized once in main() before any feature extraction runs.
    private static SenticNet senticNet;
    private static SubjectivityLexicon subjectivityLexicon;
    private static Stemmer stemmer;
    private static Intensities intensities;

    // Feature families known to the extractor (toggled via the FEATS_* flags below).
    public enum Features {
        STANFORD,
        SENTICNET,
        SUBJLEXICON,
        INTENSITY,
        WORDNET,
        SENTIWORDNET,
        MOSCHITTI,
        SST,
        ENTITIES,
        STEM,
        POS,
        DEP,
        SRL
    }

    // Master switch: the feature groups tied to MAJORITY below are enabled together.
    private static boolean MAJORITY = false;

    private static boolean FEATS_STANFORD = false;
    private static boolean FEATS_SENTICNET = MAJORITY;
    private static boolean FEATS_SUBJLEXICON = MAJORITY;
    private static boolean FEATS_INTENSITY = MAJORITY;
    private static boolean FEATS_WORDNET = MAJORITY;
    private static boolean FEATS_SENTIWORDNET = false;

    private static boolean FEATS_MOSCHITTI = true;

    private static boolean FEATS_SST = true;
    private static boolean FEATS_ENTITIES = true;
    private static boolean FEATS_STEM = true;
    private static boolean FEATS_POS = true;
    private static boolean FEATS_DEP = MAJORITY;
    private static boolean FEATS_SRL = MAJORITY;

    // Defaults for the command-line options (seed, slot window size, train/test split ratio).
    private static Long DEFAULT_SEED = 2l;
    // Column name carrying the B-t/I-t/O classification label in the feature maps.
    private static String DEFAULT_CLASSIFICATION_LABEL = "_CLASS";
    private static Integer DEFAULT_SLOT_SIZE = 1;
    private static Float DEFAULT_SPLIT = 0.75f;

    // Placeholder values written for absent ("-") and present ("Y") binary features.
    private static String DEFAULT_NONE = "-";
    private static String DEFAULT_YES = "Y";

    // Columns that additionally get bigram templates in the generated CRF template file.
    private static HashSet<String> DOUBLE_FEATURES = new HashSet<>();

    static {
        DOUBLE_FEATURES.add("LEMMA");
        DOUBLE_FEATURES.add("P");
        DOUBLE_FEATURES.add("E");
        DOUBLE_FEATURES.add("SST");
    }

    // Columns that additionally get trigram templates in the generated CRF template file.
    private static HashSet<String> TRIPLE_FEATURES = new HashSet<>();

    static {
        TRIPLE_FEATURES.add("LEMMA");
        TRIPLE_FEATURES.add("P");
    }

    // Supported output formats for the training/test files.
    public enum Type {
        MALLET, MALLET_WINDOW, YAMCHA, CRFSUITE, WAPITI
    }

    // SINGLE: one feature set per token; COMPLETE: features expanded over the slot window.
    public enum OutputType {
        SINGLE, COMPLETE
    }

    static Type DEFAULT_TYPE = Type.CRFSUITE;
100
101 public static ArrayList<ArrayList<LinkedHashMap<String, String>>> extractFeats(KAFDocument document, String[] labels, Set<String> hypernyms, boolean skipEmpty) {
102 HashSet<Term> opinionTerms = new HashSet<>();
103 HashMap<Term, String> stanfordTerms = new HashMap<>();
104 HashMap<Term, String> entityTerms = new HashMap<>();
105
106
107 HashMultimap<Term, String> srlFeatures = HashMultimap.create();
108 String featName;
109 if (FEATS_SRL) {
110 for (Predicate predicate : document.getPredicates()) {
111 for (Term term : predicate.getTerms()) {
112 srlFeatures.put(term, "isPredicate");
113 for (ExternalRef externalRef : predicate.getExternalRefs()) {
114 if (externalRef.getReference().length() == 0) {
115 continue;
116 }
117 featName = "isPredicate." + externalRef.getResource() + "." + externalRef.getReference();
118 srlFeatures.put(term, featName);
119 }
120 for (Predicate.Role role : predicate.getRoles()) {
121 for (ExternalRef externalRef : role.getExternalRefs()) {
122 if (externalRef.getReference().length() == 0) {
123 continue;
124 }
125
126
127 featName = "hasRole." + externalRef.getReference();
128 srlFeatures.put(term, featName);
129
130 for (Term roleTerm : role.getTerms()) {
131 featName = "isRole";
132 srlFeatures.put(roleTerm, featName);
133
134
135
136 for (ExternalRef roleExternalRef : predicate.getExternalRefs()) {
137 if (roleExternalRef.getReference().length() == 0) {
138 continue;
139 }
140 featName = "isRoleFor." + roleExternalRef.getReference();
141 srlFeatures.put(term, featName);
142
143
144 }
145 }
146
147 }
148 }
149
150 }
151
152 }
153 }
154
155
156 for (Entity entity : document.getEntities()) {
157 for (Term term : entity.getTerms()) {
158 entityTerms.put(term, entity.getType());
159 }
160 }
161
162
163 for (Opinion opinion : document.getOpinions()) {
164 if (opinion.getOpinionExpression() == null) {
165 continue;
166 }
167
168 if (opinion.getLabel() == null) {
169 continue;
170 }
171
172 if (opinion.getOpinionExpression().getSpan() == null) {
173 continue;
174 }
175
176 boolean hasLabel = false;
177 for (String label : labels) {
178 if (opinion.getLabel().contains(label)) {
179 hasLabel = true;
180 break;
181 }
182 }
183
184 if (!hasLabel) {
185 if (opinion.getLabel().equals("stanford-sentiment")) {
186 if (opinion.getOpinionExpression().getSpan().size() == 1) {
187 String pol = opinion.getOpinionExpression().getPolarity();
188 if (pol.equals("Neutral")) {
189 pol = "M";
190 }
191 stanfordTerms.put(opinion.getOpinionExpression().getSpan().getFirstTarget(), pol);
192 }
193 }
194 continue;
195 }
196
197
198 if (opinion.getOpinionExpression().getPolarity() != null) {
199 if (opinion.getOpinionExpression().getPolarity().equals("NON-OPINIONATED")) {
200 continue;
201 }
202 }
203
204
205 for (Term term : opinion.getOpinionExpression().getSpan().getTargets()) {
206
207
208
209
210 opinionTerms.add(term);
211 }
212 }
213
214 Multimap<Term, SenticNet.Lexeme> senticnetMM = senticNet.match(document, document.getTerms());
215 Multimap<Term, SubjectivityLexicon.Lexeme> subjectivityMM = subjectivityLexicon.match(document, document.getTerms());
216 Multimap<Term, Intensities.Lexeme> intensitiesMM = intensities.match(document, document.getTerms());
217
218
219 ArrayList<ArrayList<LinkedHashMap<String, String>>> ret = new ArrayList<>();
220
221 for (int i = 0; i < document.getNumSentences(); i++) {
222
223 ArrayList<LinkedHashMap<String, String>> sentence = new ArrayList<>();
224 int sent = i + 1;
225 String last = "O";
226 for (Term term : document.getSentenceTerms(sent)) {
227 LinkedHashMap<String, String> feats = new LinkedHashMap<>();
228
229 feats.put("TERM", term.getForm());
230 feats.put("LEMMA", term.getLemma());
231 if (FEATS_STEM) {
232 feats.put("STEM", stemmer.stem(term.getLemma()));
233 }
234 if (FEATS_POS) {
235 feats.put("P", term.getPos());
236 }
237 feats.put("M", term.getMorphofeat());
238
239 if (FEATS_DEP) {
240 Dep to = document.getDepToTerm(term);
241 feats.put("DEP.R", DEFAULT_NONE);
242 feats.put("DEP.L", DEFAULT_NONE);
243 feats.put("DEP.M", DEFAULT_NONE);
244 feats.put("DEP.P", DEFAULT_NONE);
245 if (to != null) {
246 feats.put("DEP.R", to.getRfunc());
247 feats.put("DEP.L", to.getRfunc() + "." + to.getFrom().getLemma());
248 feats.put("DEP.M", to.getRfunc() + "." + to.getFrom().getMorphofeat());
249 feats.put("DEP.P", to.getRfunc() + "." + to.getFrom().getPos());
250 }
251 }
252
253 if (FEATS_SRL) {
254 for (String s : srlFeatures.get(term)) {
255 feats.put("SRL." + s, DEFAULT_YES);
256 }
257 }
258
259 if (FEATS_ENTITIES) {
260 String entity = entityTerms.get(term);
261 if (entity == null) {
262 entity = DEFAULT_NONE;
263 }
264 if (!skipEmpty || !entity.equals(DEFAULT_NONE)) {
265 feats.put("E", entity);
266 }
267 }
268
269 if (FEATS_SST) {
270 String SST = DEFAULT_NONE;
271 for (ExternalRef externalRef : term.getExternalRefs()) {
272 if (externalRef.getResource().equals("wn30-sst")) {
273 SST = externalRef.getReference();
274 break;
275 }
276 }
277 if (!skipEmpty || !SST.equals(DEFAULT_NONE)) {
278 feats.put("SST", SST);
279 }
280 }
281
282 if (FEATS_SENTICNET) {
283 Collection<SenticNet.Lexeme> snLexemes = senticnetMM.get(term);
284 String isInSenticNet = DEFAULT_NONE;
285 String bigAptitude = DEFAULT_NONE;
286 String bigAttention = DEFAULT_NONE;
287 String bigPleasentness = DEFAULT_NONE;
288 String bigPolarity = DEFAULT_NONE;
289 String bigSensitivity = DEFAULT_NONE;
290 if (snLexemes.size() > 0) {
291 isInSenticNet = DEFAULT_YES;
292 for (SenticNet.Lexeme lexeme : snLexemes) {
293 bigAptitude = lexeme.getAptitude() > 0.5 ? DEFAULT_YES : DEFAULT_NONE;
294 bigAttention = lexeme.getAttention() > 0.5 ? DEFAULT_YES : DEFAULT_NONE;
295 bigPleasentness = lexeme.getPleasentness() > 0.5 ? DEFAULT_YES : DEFAULT_NONE;
296 bigPolarity = lexeme.getPolarity() > 0.5 ? DEFAULT_YES : DEFAULT_NONE;
297 bigSensitivity = lexeme.getSensitivity() > 0.5 ? DEFAULT_YES : DEFAULT_NONE;
298 break;
299 }
300 }
301 if (!skipEmpty || !isInSenticNet.equals(DEFAULT_NONE)) {
302 feats.put("SNi", isInSenticNet);
303 }
304 if (!skipEmpty || !bigAptitude.equals(DEFAULT_NONE)) {
305 feats.put("SNa", bigAptitude);
306 }
307 if (!skipEmpty || !bigAttention.equals(DEFAULT_NONE)) {
308 feats.put("SNt", bigAttention);
309 }
310 if (!skipEmpty || !bigPleasentness.equals(DEFAULT_NONE)) {
311 feats.put("SNl", bigPleasentness);
312 }
313 if (!skipEmpty || !bigPolarity.equals(DEFAULT_NONE)) {
314 feats.put("SNp", bigPolarity);
315 }
316 if (!skipEmpty || !bigSensitivity.equals(DEFAULT_NONE)) {
317 feats.put("SNs", bigSensitivity);
318 }
319 }
320
321 Collection<SubjectivityLexicon.Lexeme> slLexemes = null;
322 if (FEATS_MOSCHITTI || FEATS_SUBJLEXICON) {
323 slLexemes = subjectivityMM.get(term);
324 }
325
326 if (FEATS_MOSCHITTI) {
327 String subjLexM = DEFAULT_NONE;
328 if (slLexemes.size() > 0) {
329 String level = "weak";
330 String pol = "neu";
331 for (SubjectivityLexicon.Lexeme lexeme : slLexemes) {
332 if (lexeme.isStrong()) {
333 level = "str";
334 }
335 pol = lexeme.getPolarity().toString().substring(0, 3).toLowerCase();
336 break;
337 }
338 subjLexM = level + "/" + pol;
339 }
340 if (!skipEmpty || !subjLexM.equals(DEFAULT_NONE)) {
341 feats.put("MOSCHITTI", subjLexM);
342 }
343 }
344
345 if (FEATS_SUBJLEXICON) {
346 String isInSubjLex = DEFAULT_NONE;
347 String subjLexM = DEFAULT_NONE;
348 String isInSubjLexStrong = DEFAULT_NONE;
349 if (slLexemes.size() > 0) {
350 isInSubjLex = DEFAULT_YES;
351 for (SubjectivityLexicon.Lexeme lexeme : slLexemes) {
352 if (lexeme.isStrong()) {
353 isInSubjLexStrong = DEFAULT_YES;
354 }
355 subjLexM = lexeme.getPolarity().toString() + "." + isInSubjLexStrong;
356 break;
357 }
358 }
359 if (!skipEmpty || !isInSubjLex.equals(DEFAULT_NONE)) {
360 feats.put("SLi", isInSubjLex);
361 }
362 if (!skipEmpty || !isInSubjLexStrong.equals(DEFAULT_NONE)) {
363 feats.put("SLs", isInSubjLexStrong);
364 }
365 if (!skipEmpty || !subjLexM.equals(DEFAULT_NONE)) {
366 feats.put("SLm", subjLexM);
367 }
368 }
369
370 if (FEATS_INTENSITY) {
371 for (Intensities.Type type : Intensities.Type.values()) {
372 String typeStr = DEFAULT_NONE;
373 if (intensitiesMM.get(term).size() > 0) {
374 for (Intensities.Lexeme lexeme : intensitiesMM.get(term)) {
375 if (lexeme.getType().equals(type)) {
376
377 typeStr = DEFAULT_YES;
378 }
379 }
380 }
381 char first = type.toString().charAt(0);
382 if (!skipEmpty || !typeStr.equals(DEFAULT_NONE)) {
383 feats.put("IN" + first, typeStr);
384 }
385 }
386 }
387
388 if (FEATS_STANFORD) {
389 String stanfordLabel = "M";
390 if (stanfordTerms.containsKey(term)) {
391 stanfordLabel = stanfordTerms.get(term);
392 }
393 String[] split = stanfordLabel.split("(?<=[\\S])[\\S]*\\s*");
394 stanfordLabel = ArrayUtils.implode("", split);
395 feats.put("STF", stanfordLabel);
396 }
397
398 String wnSense = getWnFromTerm(term);
399
400 if (FEATS_WORDNET) {
401 Set<String> termHypernyms = new HashSet<>();
402 if (wnSense != null) {
403 termHypernyms = WordNet.getHypernyms(wnSense, true);
404 }
405 if (hypernyms.size() > 0) {
406 for (String hypernym : hypernyms) {
407 if (termHypernyms.contains(hypernym)) {
408 feats.put("WN." + hypernym, DEFAULT_YES);
409 }
410 else {
411 if (!skipEmpty) {
412 feats.put("WN." + hypernym, DEFAULT_NONE);
413 }
414 }
415 }
416 }
417 else {
418 for (String hypernym : termHypernyms) {
419 feats.put("WN." + hypernym, DEFAULT_YES);
420 }
421 }
422 }
423
424 if (FEATS_SENTIWORDNET) {
425 if (!skipEmpty) {
426 feats.put("SWN+", DEFAULT_NONE);
427 feats.put("SWN-", DEFAULT_NONE);
428 }
429 if (wnSense != null) {
430 PosNegPair swnPair = SentiWordNet.searchValue(wnSense);
431 int posTimes = (int) Math.round(swnPair.getPosScore() / .125);
432 int negTimes = (int) Math.round(swnPair.getNegScore() / .125);
433 if (posTimes > 0) {
434 feats.put("SWN+", Integer.toString(posTimes));
435 }
436 if (negTimes > 0) {
437 feats.put("SWN-", Integer.toString(negTimes));
438 }
439 }
440 }
441
442 if (opinionTerms.contains(term)) {
443 if (last.equals("O")) {
444 last = "B-t";
445 }
446 else {
447 last = "I-t";
448 }
449 }
450 else {
451 last = "O";
452 }
453 feats.put(DEFAULT_CLASSIFICATION_LABEL, last);
454
455 sentence.add(feats);
456 }
457 ret.add(sentence);
458 }
459
460 return ret;
461 }
462
463 public static void main(String[] args) {
464 try {
465 final CommandLine cmd = CommandLine
466 .parser()
467 .withName("yamcha-extractor")
468 .withHeader("Extract YAMCHA training set")
469 .withOption("i", "input-folder", "the folder of the corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
470 .withOption("w", "wordnet-path", "WordNet dict folder", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
471 .withOption("s", "sentiwordnet-path", "SentiWordNet file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
472 .withOption("o", "output-folder", "output folder", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
473 .withOption("l", "label", "label(s), in comma separated format", "LABEL", CommandLine.Type.STRING, true, false, false)
474 .withOption("t", "type", String.format("Output type, default %s", DEFAULT_TYPE), "TYPE", CommandLine.Type.STRING, true, false, false)
475
476 .withOption(null, "seed", "Seed", "NUM", CommandLine.Type.FLOAT, true, false, false)
477 .withOption(null, "slot", String.format("Slot size, default %d", DEFAULT_SLOT_SIZE), "NUM", CommandLine.Type.NON_NEGATIVE_INTEGER, true, false, false)
478 .withOption(null, "split", "Split part (training)", "NUM", CommandLine.Type.POSITIVE_FLOAT, true, false, false)
479 .withOption(null, "skip-empty-train", "Skip empty sentences in training")
480 .withOption(null, "skip-empty-test", "Skip empty sentences in test")
481 .withOption(null, "train-list", "Trining set file list", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
482 .withOption(null, "test-list", "Test set file list", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
483 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
484
485 File mainFolder = cmd.getOptionValue("i", File.class);
486 File outputFolder = cmd.getOptionValue("o", File.class);
487
488 File wnFolder = cmd.getOptionValue("w", File.class);
489 File swnFolder = cmd.getOptionValue("s", File.class);
490
491 String label = cmd.getOptionValue("l", String.class, DEFAULT_LABEL);
492 String[] labels = label.split(",");
493
494 boolean skipEmptyTrain = cmd.hasOption("skip-empty-train");
495 boolean skipEmptyTest = cmd.hasOption("skip-empty-test");
496
497 Type type = DEFAULT_TYPE;
498
499 String typeString = cmd.getOptionValue("type", String.class);
500 if (typeString != null) {
501 try {
502 type = Type.valueOf(typeString.toUpperCase());
503 } catch (Exception e) {
504 throw new CommandLine.Exception(e.getMessage(), e);
505 }
506 }
507
508 if (type.equals(Type.YAMCHA)) {
509 FEATS_SRL = false;
510 FEATS_WORDNET = false;
511 FEATS_SENTIWORDNET = false;
512 }
513
514 Integer slotSize = cmd.getOptionValue("slot", Integer.class, DEFAULT_SLOT_SIZE);
515 Float split = cmd.getOptionValue("split", Float.class, DEFAULT_SPLIT);
516
517
518
519 Long seed = cmd.getOptionValue("seed", Long.class, DEFAULT_SEED);
520
521
522
523
524
525
526
527
528
529 File trainList = cmd.getOptionValue("train-list", File.class);
530 File testList = cmd.getOptionValue("test-list", File.class);
531
532 if ((trainList != null && testList == null) || (testList != null && trainList == null)) {
533 throw new Exception("Train list and test list must be both declared or both missing");
534 }
535
536
537
538 if (!outputFolder.exists()) {
539 boolean createdOutputFolder = outputFolder.mkdirs();
540 if (!createdOutputFolder) {
541 LOGGER.error("Unable to create {}", outputFolder.getAbsolutePath());
542 System.exit(1);
543 }
544 }
545
546 LOGGER.info("Loading resources");
547 senticNet = SenticNet.getInstance();
548 subjectivityLexicon = SubjectivityLexicon.getInstance();
549 stemmer = StemmerFactory.getInstance(Locale.US);
550 intensities = Intensities.getInstance();
551
552 if (wnFolder != null) {
553 WordNet.setPath(wnFolder.getAbsolutePath());
554 WordNet.init();
555 }
556
557 if (swnFolder != null) {
558 SentiWordNet.setPath(swnFolder);
559 SentiWordNet.init();
560 }
561
562 LOGGER.info("Parsing corpus");
563 Corpus[] corpuses = new Corpus[2];
564 if (trainList != null) {
565 List<File> trainFiles = readList(trainList, mainFolder, "naf");
566 List<File> testFiles = readList(testList, mainFolder, "naf");
567 corpuses[0] = Corpus.create(false, trainFiles);
568 corpuses[1] = Corpus.create(false, testFiles);
569 }
570 else {
571 Corpus myCorpus = Corpus.create(false, mainFolder);
572 corpuses = myCorpus.split(seed, split, 1.0f - split);
573 }
574
575
576 Set<String> allHypernyms = new TreeSet<>();
577
578
579 ArrayList<String> columns = new ArrayList<>();
580 if (type.equals(Type.YAMCHA)) {
581 if (wnFolder != null) {
582 LOGGER.info("Collecting WordNet information");
583 for (int i = 0; i < 2; i++) {
584 for (Path file : corpuses[i].files()) {
585 KAFDocument document = corpuses[i].get(file);
586 for (Term term : document.getTerms()) {
587 String wnSense = getWnFromTerm(term);
588 if (wnSense != null && wnSense.length() > 0) {
589 Set<String> hypernyms = WordNet.getHypernyms(wnSense, true);
590 allHypernyms.addAll(hypernyms);
591 }
592 }
593 }
594 }
595 LOGGER.info("Loaded {} hypernyms", allHypernyms.size());
596 }
597 for (Path file : corpuses[0].files()) {
598 KAFDocument document = corpuses[0].get(file);
599 ArrayList<ArrayList<LinkedHashMap<String, String>>> sentences = extractFeats(document, labels, allHypernyms, false);
600 if (columns.size() == 0 && sentences.size() > 0 && sentences.get(0).size() > 0) {
601 for (String key : sentences.get(0).get(0).keySet()) {
602 if (!key.equals(DEFAULT_CLASSIFICATION_LABEL)) {
603 columns.add(key);
604 }
605 }
606 break;
607 }
608 }
609 }
610
611
612 LOGGER.info("Loading training data");
613 File trainDataFile = new File(outputFolder.getAbsolutePath() + File.separator + "data.train");
614 BufferedWriter trainWriter = new BufferedWriter(new FileWriter(trainDataFile));
615 for (Path file : corpuses[0].files()) {
616 KAFDocument document = corpuses[0].get(file);
617 writeFeats(document, trainWriter, labels, skipEmptyTrain, allHypernyms, type, slotSize);
618 }
619 trainWriter.close();
620
621
622 LOGGER.info("Loading test data");
623 File testDataFile = new File(outputFolder.getAbsolutePath() + File.separator + "data.test");
624 BufferedWriter testWriter = new BufferedWriter(new FileWriter(testDataFile));
625 for (Path file : corpuses[1].files()) {
626 KAFDocument document = corpuses[1].get(file);
627 writeFeats(document, testWriter, labels, skipEmptyTest, allHypernyms, type, slotSize);
628 }
629 testWriter.close();
630
631 if (type.equals(Type.YAMCHA)) {
632 File templateFile = new File(outputFolder.getAbsolutePath() + File.separator + "template.crf");
633 BufferedWriter templateWriter = new BufferedWriter(new FileWriter(templateFile));
634 StringBuffer buffer = new StringBuffer();
635
636 int featNo = 0;
637 for (int i = 0; i < columns.size(); i++) {
638 String colName = columns.get(i);
639
640 if (colName.equals(DEFAULT_CLASSIFICATION_LABEL)) {
641 continue;
642 }
643
644 buffer.append("#").append(colName).append("\n");
645
646 if (!colName.startsWith("WN")) {
647 for (int offset = -slotSize; offset <= slotSize; offset++) {
648 buffer.append("U").append(++featNo).append(":")
649 .append("%x[").append(offset).append(",").append(i).append("]")
650 .append("\n");
651 }
652 }
653 else {
654 buffer.append("U").append(++featNo).append(":")
655 .append("%x[").append("0").append(",").append(i).append("]")
656 .append("\n");
657 }
658
659 if (DOUBLE_FEATURES.contains(colName)) {
660 for (int offset = -slotSize; offset <= slotSize - 1; offset++) {
661 buffer.append("U").append(++featNo).append(":")
662 .append("%x[").append(offset).append(",").append(i).append("]")
663 .append("/")
664 .append("%x[").append(offset + 1).append(",").append(i).append("]")
665 .append("\n");
666 }
667 }
668
669 if (TRIPLE_FEATURES.contains(colName)) {
670 for (int offset = -slotSize; offset <= slotSize - 2; offset++) {
671 buffer.append("U").append(++featNo).append(":")
672 .append("%x[").append(offset).append(",").append(i).append("]")
673 .append("/")
674 .append("%x[").append(offset + 1).append(",").append(i).append("]")
675 .append("/")
676 .append("%x[").append(offset + 2).append(",").append(i).append("]")
677 .append("\n");
678 }
679 }
680
681 buffer.append("\n");
682 }
683
684 buffer.append("#BIGRAMS\n");
685 buffer.append("B").append("\n");
686
687 templateWriter.write(buffer.toString());
688 templateWriter.close();
689 }
690
691 LOGGER.debug(columns.toString());
692
693 } catch (final Throwable ex) {
694 CommandLine.fail(ex);
695 }
696 }
697
698 private static String getWnFromTerm(Term term) {
699 String wnSense = term.getWordnetSense();
700 if (wnSense == null || wnSense.length() == 0) {
701 for (ExternalRef externalRef : term.getExternalRefs()) {
702 if (externalRef.getResource().equals("wn30-ukb")) {
703 wnSense = externalRef.getReference();
704 if (wnSense != null && wnSense.length() > 0) {
705 break;
706 }
707 }
708 }
709 }
710
711 return wnSense;
712 }
713
714 public static List<File> readList(File fileList, File baseFolder, @Nullable String replaceExtension) throws IOException {
715
716 List<File> ret = new ArrayList<>();
717
718 BufferedReader reader = null;
719 try {
720 reader = new BufferedReader(new FileReader(fileList));
721
722 String line;
723 while ((line = reader.readLine()) != null) {
724 line = line.trim();
725 if (line.length() == 0) {
726 continue;
727 }
728
729 String fileName = baseFolder.getAbsolutePath() + File.separator + line;
730
731 if (replaceExtension != null) {
732 fileName = fileName.replaceAll("\\.[^\\.]+$", "." + replaceExtension);
733 }
734
735 File file = new File(fileName);
736 if (!file.exists()) {
737 LOGGER.warn("File {} does not exist", fileName);
738 continue;
739 }
740
741 ret.add(file);
742 }
743 } catch (Exception e) {
744 LOGGER.error(e.getMessage());
745 } finally {
746 if (reader != null) {
747 reader.close();
748 }
749 }
750 return ret;
751 }
752
753 private static void writeFeats(KAFDocument document, BufferedWriter writer, String[] labels, boolean skipEmptySentences, Set<String> hypernyms, Type type, int slotSize) throws IOException {
754
755 char space = '\t';
756 OutputType outputType = OutputType.SINGLE;
757 boolean classBefore = false;
758 boolean skipEmptyFeatures = true;
759 String featurePrefix = "";
760
761 switch (type) {
762 case MALLET:
763 space = ' ';
764 break;
765 case MALLET_WINDOW:
766 space = ' ';
767 outputType = OutputType.COMPLETE;
768 break;
769 case CRFSUITE:
770 outputType = OutputType.COMPLETE;
771 classBefore = true;
772 break;
773 case YAMCHA:
774 skipEmptyFeatures = false;
775 break;
776 case WAPITI:
777 outputType = OutputType.COMPLETE;
778 featurePrefix = "u:";
779 break;
780 }
781
782 ArrayList<ArrayList<LinkedHashMap<String, String>>> sentences = extractFeats(document, labels, hypernyms, false);
783
784 String string1 = "";
785 String string2 = "";
786 if (classBefore) {
787 string1 = Character.toString(space);
788 }
789 else {
790 string2 = Character.toString(space);
791 }
792
793 string1 += featurePrefix;
794
795 StringBuffer bigBuffer = new StringBuffer();
796
797 for (ArrayList<LinkedHashMap<String, String>> sentence : sentences) {
798
799 boolean isAnnotated = false;
800 StringBuffer buffer = new StringBuffer();
801
802 for (int i = 0; i < sentence.size(); i++) {
803 LinkedHashMap<String, String> token = sentence.get(i);
804 String classification = token.get(DEFAULT_CLASSIFICATION_LABEL);
805
806 if (classBefore) {
807 buffer.append(classification);
808 }
809
810 switch (outputType) {
811 case SINGLE:
812
813 for (String key : token.keySet()) {
814 if (key.equals(DEFAULT_CLASSIFICATION_LABEL)) {
815 continue;
816 }
817
818 String value = token.get(key);
819
820 if (key.startsWith("WN")) {
821 buffer.append(string1).append(key).append(string2);
822 }
823 else {
824 if (!skipEmptyFeatures || !value.equals(DEFAULT_NONE)) {
825 buffer.append(string1).append(key).append(".").append(value).append(string2);
826 }
827 }
828 }
829
830 break;
831 case COMPLETE:
832
833
834 if (i == 0) {
835 buffer.append(string1).append("BOS").append(string2);
836 }
837
838
839 for (String key : token.keySet()) {
840 if (key.equals(DEFAULT_CLASSIFICATION_LABEL)) {
841 continue;
842 }
843
844
845
846 if (key.startsWith("WN")) {
847 buffer.append(string1).append(key).append(string2);
848 }
849 else {
850 for (int offset = -slotSize; offset <= slotSize; offset++) {
851 LinkedHashMap<String, String> thisToken;
852 try {
853 thisToken = sentence.get(i + offset);
854 } catch (IndexOutOfBoundsException e) {
855 continue;
856 }
857
858 String thisValue = thisToken.get(key);
859 if (thisValue == null) {
860 continue;
861 }
862 if (!skipEmptyFeatures || !thisValue.equals(DEFAULT_NONE)) {
863 buffer.append(string1)
864 .append("[").append(offset).append("]")
865 .append(key).append(".").append(thisValue)
866 .append(string2);
867 }
868 }
869 }
870 }
871
872
873 if (i == sentence.size() - 1) {
874 buffer.append(string1).append("EOS").append(string2);
875 }
876
877 break;
878 }
879
880 if (!classBefore) {
881 buffer.append(classification);
882 }
883
884 buffer.append("\n");
885 }
886 buffer.append("\n");
887 if (!skipEmptySentences || isAnnotated) {
888 bigBuffer.append(buffer.toString());
889 }
890 }
891
892 writer.write(bigBuffer.toString());
893 }
894 }