1 package eu.fbk.dkm.pikes.raid;
2
3 import eu.fbk.dkm.pikes.naflib.Corpus;
4 import eu.fbk.dkm.pikes.raid.mdfsa.APIManager;
5 import eu.fbk.dkm.pikes.resources.mpqa.CorpusAnnotator;
6 import eu.fbk.utils.core.CommandLine;
7 import ixa.kaflib.KAFDocument;
8 import ixa.kaflib.Opinion;
9 import ixa.kaflib.Span;
10 import ixa.kaflib.Term;
11 import org.slf4j.LoggerFactory;
12
13 import java.io.*;
14 import java.nio.file.Path;
15 import java.util.HashMap;
16 import java.util.List;
17 import java.util.Properties;
18
19 import static eu.fbk.dkm.pikes.raid.CreateTrainingForExpression.readList;
20
21
22
23
24
25 public class UpdateNafsWithResults {
26
27 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(UpdateNafsWithResults.class);
28 private static final Integer MAX_DOCS = 10;
29 private static final String DEFAULT_LABEL = "gold";
30
31 private static Long DEFAULT_SEED = 2l;
32 private static String DEFAULT_CLASSIFICATION_LABEL = "_CLASS";
33 private static Float DEFAULT_SPLIT = 0.75f;
34
35 private static Float DEFAULT_NEG_POL = -0.2f;
36 private static Float DEFAULT_POS_POL = 0.2f;
37
38 public static void main(String[] args) {
39 try {
40 final CommandLine cmd = CommandLine
41 .parser()
42 .withName("yamcha-extractor")
43 .withHeader("Extract YAMCHA training set")
44 .withOption("i", "input-folder", "the folder of the corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
45 .withOption("o", "output-folder", "output folder", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
46 .withOption("r", "results-file", "CRF++ results file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
47 .withOption("l", "label", "label to use", "LABEL", CommandLine.Type.STRING, true, false, true)
48 .withOption("e", "extensions", String.format("Input extensions (default %s)", CorpusAnnotator.DEFAULT_NAF_EXTENSIONS), "EXTS", CommandLine.Type.STRING, true, true, false)
49 .withOption(null, "sentiment-model", "MDFSA model", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
50 .withOption(null, "sentiment-properties", "MDFSA properties file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
51 .withOption(null, "sentiment-neg-limit", String.format("MDFSA negative limit (default %f)", DEFAULT_NEG_POL), "NUM", CommandLine.Type.FLOAT, true, false, false)
52 .withOption(null, "sentiment-pos-limit", String.format("MDFSA positive limit (default %f)", DEFAULT_POS_POL), "NUM", CommandLine.Type.FLOAT, true, false, false)
53 .withOption(null, "seed", "Seed", "NUM", CommandLine.Type.FLOAT, true, false, false)
54 .withOption(null, "split", "Split part (training)", "NUM", CommandLine.Type.POSITIVE_FLOAT, true, false, false)
55 .withOption(null, "train-list", "Training set file list", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
56 .withOption(null, "test-list", "Test set file list", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
57 .withOption(null, "fake", "Fake mode")
58 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
59
60 File mainFolder = cmd.getOptionValue("input-folder", File.class);
61 File outputFolder = cmd.getOptionValue("output-folder", File.class);
62
63 File resultsFile = cmd.getOptionValue("results-file", File.class);
64 String label = cmd.getOptionValue("label", String.class);
65 Float split = cmd.getOptionValue("split", Float.class, DEFAULT_SPLIT);
66 Long seed = cmd.getOptionValue("seed", Long.class, DEFAULT_SEED);
67
68 List<String> extensions = null;
69 if (cmd.hasOption("extensions")) {
70 extensions = cmd.getOptionValues("extensions", String.class);
71 }
72 if (extensions == null) {
73 extensions = CorpusAnnotator.DEFAULT_NAF_EXTENSIONS;
74 }
75
76 File trainList = cmd.getOptionValue("train-list", File.class);
77 File testList = cmd.getOptionValue("test-list", File.class);
78
79 File sentimentModel = cmd.getOptionValue("sentiment-model", File.class);
80 File sentimentProperties = cmd.getOptionValue("sentiment-properties", File.class);
81
82 Float negLimit = cmd.getOptionValue("sentiment-neg-limit", Float.class, DEFAULT_NEG_POL);
83 Float posLimit = cmd.getOptionValue("sentiment-pos-limit", Float.class, DEFAULT_POS_POL);
84
85 boolean fakeMode = cmd.hasOption("fake");
86
87 if ((trainList != null && testList == null) || (testList != null && trainList == null)) {
88 throw new CommandLine.Exception("Train list and test list must be both declared or both missing");
89 }
90
91
92
93 if (!outputFolder.exists()) {
94 boolean mkdirs = outputFolder.mkdirs();
95 if (!mkdirs) {
96 throw new Exception(String.format("Unable to create folder %s", outputFolder.getAbsolutePath()));
97 }
98 }
99
100 APIManager am = null;
101 if (sentimentModel != null && sentimentProperties != null) {
102 LOGGER.info("Loading sentiment models");
103
104 Properties prp = new Properties();
105 InputStream iS = new FileInputStream(sentimentProperties);
106 prp.load(iS);
107
108 am = new APIManager(prp);
109 am.loadModel(sentimentModel.getAbsolutePath());
110 }
111
112
113 LOGGER.info("Parsing corpus");
114 Corpus[] corpuses = new Corpus[2];
115 if (trainList != null) {
116 List<File> trainFiles = readList(trainList, mainFolder, "naf");
117 List<File> testFiles = readList(testList, mainFolder, "naf");
118 corpuses[0] = Corpus.create(false, trainFiles);
119 corpuses[1] = Corpus.create(false, testFiles);
120 }
121 else {
122 Corpus myCorpus = Corpus.create(false, mainFolder);
123 corpuses = myCorpus.split(seed, split, 1.0f - split);
124 }
125
126 BufferedReader reader = new BufferedReader(new FileReader(resultsFile));
127 HashMap<Integer, Integer> startIndex = new HashMap<>();
128 HashMap<Integer, Integer> endIndex = new HashMap<>();
129
130 int exprID = -1;
131 int j = -1;
132
133 String line;
134 while ((line = reader.readLine()) != null) {
135 String[] parts = line.split("\\s+");
136 if (parts.length <= 1) {
137 continue;
138 }
139
140 String res = parts[parts.length - 1];
141 j++;
142
143 if (res.startsWith("B")) {
144 exprID++;
145 endIndex.put(exprID, j);
146 startIndex.put(j, exprID);
147 }
148 if (res.startsWith("I")) {
149 if (endIndex.get(exprID) == j - 1) {
150 endIndex.put(exprID, j);
151 }
152 }
153 }
154 LOGGER.info("Total tokens in the test: {}", j + 1);
155 reader.close();
156
157 j = -1;
158 for (Path file : corpuses[1].files()) {
159
160 String baseFileName = file.toFile().getName();
161 String outputFile = outputFolder.getAbsolutePath() + File.separator + baseFileName;
162
163 LOGGER.debug(baseFileName);
164
165 KAFDocument document = corpuses[1].get(file);
166 List<Term> terms = document.getTerms();
167 for (int i = 0; i < terms.size(); i++) {
168 j++;
169
170 if (startIndex.keySet().contains(j)) {
171 int length = endIndex.get(startIndex.get(j)) - j + 1;
172 Span<Term> termSpan = KAFDocument.newTermSpan();
173 for (int k = 0; k < length; k++) {
174 Term term;
175 try {
176 term = terms.get(i + k);
177 } catch (Exception e) {
178 LOGGER.warn("Error in token {} ({}) in file {}", i + k, j + 1, baseFileName);
179 continue;
180 }
181 termSpan.addTarget(term);
182 }
183
184 Opinion opinion = document.newOpinion();
185 opinion.setLabel(label);
186 Opinion.OpinionExpression opinionExpression = opinion.createOpinionExpression(termSpan);
187
188
189 if (am != null) {
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263 double computedPolarity = am.evaluateSentence(termSpan.getStr());
264
265 String polarity = "Neutral";
266 if (computedPolarity != -2.0) {
267 if (computedPolarity < negLimit) {
268 polarity = "Negative";
269 }
270 if (computedPolarity > posLimit) {
271 polarity = "Positive";
272 }
273 }
274
275 opinionExpression.setPolarity(polarity);
276
277
278 }
279 }
280 }
281
282 if (!fakeMode) {
283 document.save(outputFile);
284 }
285 }
286 LOGGER.info("Total tokens in the NAFs: {}", j + 1);
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394 } catch (final Throwable ex) {
395 CommandLine.fail(ex);
396 }
397 }
398
399 }