1 package eu.fbk.dkm.pikes.raid;
2
3 import com.google.common.collect.Lists;
4 import com.google.common.collect.Multimap;
5 import eu.fbk.dkm.pikes.resources.SenticNet;
6 import eu.fbk.dkm.pikes.resources.SubjectivityLexicon;
7 import eu.fbk.dkm.pikes.resources.WordNet;
8 import eu.fbk.dkm.pikes.resources.mpqa.CorpusAnnotator;
9 import eu.fbk.utils.analysis.stemmer.Stemmer;
10 import eu.fbk.utils.analysis.stemmer.StemmerFactory;
11 import eu.fbk.utils.core.CommandLine;
12 import eu.fbk.utils.svm.Classifier;
13 import eu.fbk.utils.svm.LabelledVector;
14 import eu.fbk.utils.svm.Vector;
15 import ixa.kaflib.KAFDocument;
16 import ixa.kaflib.Opinion;
17 import ixa.kaflib.Term;
18 import org.apache.commons.io.FileUtils;
19 import org.slf4j.LoggerFactory;
20
21 import java.io.File;
22 import java.util.*;
23
24
25
26
27
28 public class SenticSubjlexTraining {
29
30 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(SenticSubjlexTraining.class);
31 private static final Integer MAX_DOCS = 200;
32
33 public static void main(String[] args) {
34 try {
35 final CommandLine cmd = CommandLine
36 .parser()
37 .withName("yamcha-extractor")
38 .withHeader("Extract YAMCHA training set")
39 .withOption("i", "input-folder", "the folder of the corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
40 .withOption("w", "wordnet-path", "WordNet dict folder", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
41
42 .withOption("e", "extensions", String.format("Input extensions (default %s)", CorpusAnnotator.DEFAULT_NAF_EXTENSIONS), "EXTS", CommandLine.Type.STRING, true, true, false)
43 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
44
45 File mainFolder = cmd.getOptionValue("i", File.class);
46 File wnFolder = cmd.getOptionValue("w", File.class);
47
48
49 List<String> extensions = null;
50 if (cmd.hasOption("e")) {
51 extensions = cmd.getOptionValues("e", String.class);
52 }
53 if (extensions == null) {
54 extensions = CorpusAnnotator.DEFAULT_NAF_EXTENSIONS;
55 }
56
57 Stemmer stemmer = StemmerFactory.getInstance(Locale.US);
58
59
60
61
62
63
64
65
66
67
68
69
70 SenticNet senticNet = SenticNet.getInstance();
71 SubjectivityLexicon subjectivityLexicon = SubjectivityLexicon.getInstance();
72
73 WordNet.setPath(wnFolder.getAbsolutePath());
74 WordNet.init();
75
76 int numFiles = 0;
77
78 LOGGER.info("Loading file list");
79
80 if (!mainFolder.exists()) {
81 LOGGER.error("Folder {} does not exist", mainFolder.getAbsolutePath());
82 }
83
84 Iterator<File> fileIterator;
85
86 Classifier.Parameters parameters = Classifier.Parameters.forLinearL1LossL2Reg(2, new float[]{1, 2}, 1.0f, 1.0f);
87 List<LabelledVector> trainingSet = new ArrayList<>();
88
89
90
91 fileIterator = FileUtils.iterateFiles(mainFolder, extensions.toArray(new String[extensions.size()]), true);
92 List<File> files = Lists.newArrayList(fileIterator);
93 Collections.shuffle(files);
94 for (File file : files) {
95 numFiles++;
96 if (MAX_DOCS != null && numFiles > MAX_DOCS) {
97 break;
98 }
99
100 LOGGER.info(String.format("Loading file %s", file));
101 KAFDocument document = KAFDocument.createFromFile(file);
102
103
104 Multimap<Term, SenticNet.Lexeme> senticnetMM = senticNet.match(document, document.getTerms());
105 Multimap<Term, SubjectivityLexicon.Lexeme> subjectivityMM = subjectivityLexicon.match(document, document.getTerms());
106
107 HashSet<Term> opinionTerms = new HashSet<>();
108
109 for (Opinion opinion : document.getOpinions()) {
110 if (opinion.getOpinionExpression() == null) {
111 continue;
112 }
113
114 if (opinion.getLabel() == null) {
115 continue;
116 }
117
118 if (!opinion.getLabel().contains("gold")) {
119 continue;
120 }
121
122 if (opinion.getOpinionExpression().getSpan() == null) {
123 continue;
124 }
125
126 for (Term term : opinion.getOpinionExpression().getSpan().getTargets()) {
127 opinionTerms.add(term);
128 }
129 }
130
131 for (Term term : document.getTerms()) {
132 final LabelledVector.Builder builder = Vector.builder();
133
134 boolean inDataset = false;
135
136 if (!senticnetMM.get(term).isEmpty()) {
137 builder.set("SENTIC", true);
138 inDataset = true;
139 }
140 if (!subjectivityMM.get(term).isEmpty()) {
141 builder.set("SUBJLEX", true);
142 inDataset = true;
143 }
144
145 if (!inDataset) {
146 continue;
147 }
148
149 builder.set("LEMMA." + term.getLemma(), true);
150 builder.set("MORPHO." + term.getMorphofeat(), true);
151 builder.set("POS." + term.getPos(), true);
152
153 int label = 0;
154 if (opinionTerms.contains(term)) {
155 label = 1;
156 }
157 LabelledVector vector = builder.build().label(label, null);
158 trainingSet.add(vector);
159 }
160 }
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193 List<Classifier.Parameters> grid = parameters.grid(25, 10);
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208 } catch (final Throwable ex) {
209 CommandLine.fail(ex);
210 }
211 }
212 }