package eu.fbk.dkm.pikes.raid;

import eu.fbk.dkm.pikes.naflib.Corpus;
import eu.fbk.dkm.pikes.raid.mdfsa.APIManager;
import eu.fbk.dkm.pikes.resources.mpqa.CorpusAnnotator;
import eu.fbk.utils.core.CommandLine;
import ixa.kaflib.KAFDocument;
import ixa.kaflib.Opinion;
import ixa.kaflib.Span;
import ixa.kaflib.Term;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;

import static eu.fbk.dkm.pikes.raid.CreateTrainingForExpression.readList;

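/*
 * Example invocation (paths and file names below are hypothetical):
 *
 *   java -cp pikes.jar eu.fbk.dkm.pikes.raid.UpdateNafsWithResults \
 *       -i /data/mpqa/nafs -o /data/mpqa/nafs-annotated \
 *       -r /data/crf/test.results -l gold \
 *       --sentiment-model /data/mdfsa/model.bin \
 *       --sentiment-properties /data/mdfsa/dkm.properties
 */
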
/**
 * Reads a CRF++ results file (one token per line, BIO tag in the last column),
 * rebuilds the opinion expressions it encodes, and writes them into the
 * corresponding test-set NAF documents, optionally attaching a sentiment
 * polarity computed with the MDFSA {@link APIManager}.
 *
 * Created by alessio on 17/04/15.
 */

public class UpdateNafsWithResults {

	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(UpdateNafsWithResults.class);
	private static final Integer MAX_DOCS = 10;
	private static final String DEFAULT_LABEL = "gold";

	private static final Long DEFAULT_SEED = 2L;
	private static final String DEFAULT_CLASSIFICATION_LABEL = "_CLASS";
	private static final Float DEFAULT_SPLIT = 0.75f;

	// Polarity thresholds: scores below the negative limit are tagged Negative,
	// scores above the positive limit Positive, everything in between Neutral.
	private static final Float DEFAULT_NEG_POL = -0.2f;
	private static final Float DEFAULT_POS_POL = 0.2f;

	public static void main(String[] args) {
		try {
			final CommandLine cmd = CommandLine
					.parser()
					.withName("update-nafs-with-results")
					.withHeader("Update NAF files with the opinion expressions in a CRF++ results file")
					.withOption("i", "input-folder", "the folder of the corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
					.withOption("o", "output-folder", "output folder", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
					.withOption("r", "results-file", "CRF++ results file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
					.withOption("l", "label", "label to use", "LABEL", CommandLine.Type.STRING, true, false, true)
					.withOption("e", "extensions", String.format("Input extensions (default %s)", CorpusAnnotator.DEFAULT_NAF_EXTENSIONS), "EXTS", CommandLine.Type.STRING, true, true, false)
					.withOption(null, "sentiment-model", "MDFSA model", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
					.withOption(null, "sentiment-properties", "MDFSA properties file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
					.withOption(null, "sentiment-neg-limit", String.format("MDFSA negative limit (default %f)", DEFAULT_NEG_POL), "NUM", CommandLine.Type.FLOAT, true, false, false)
					.withOption(null, "sentiment-pos-limit", String.format("MDFSA positive limit (default %f)", DEFAULT_POS_POL), "NUM", CommandLine.Type.FLOAT, true, false, false)
					.withOption(null, "seed", String.format("Random seed for the train/test split (default %d)", DEFAULT_SEED), "NUM", CommandLine.Type.INTEGER, true, false, false)
					.withOption(null, "split", String.format("Fraction of documents used for training (default %f)", DEFAULT_SPLIT), "NUM", CommandLine.Type.POSITIVE_FLOAT, true, false, false)
					.withOption(null, "train-list", "Training set file list", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
					.withOption(null, "test-list", "Test set file list", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
					.withOption(null, "fake", "Fake mode (do not save the updated NAFs)")
					.withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);

			File mainFolder = cmd.getOptionValue("input-folder", File.class);
			File outputFolder = cmd.getOptionValue("output-folder", File.class);

			File resultsFile = cmd.getOptionValue("results-file", File.class);
			String label = cmd.getOptionValue("label", String.class);
			Float split = cmd.getOptionValue("split", Float.class, DEFAULT_SPLIT);
			Long seed = cmd.getOptionValue("seed", Long.class, DEFAULT_SEED);

			List<String> extensions = null;
			if (cmd.hasOption("extensions")) {
				extensions = cmd.getOptionValues("extensions", String.class);
			}
			if (extensions == null) {
				extensions = CorpusAnnotator.DEFAULT_NAF_EXTENSIONS;
			}

			File trainList = cmd.getOptionValue("train-list", File.class);
			File testList = cmd.getOptionValue("test-list", File.class);

			File sentimentModel = cmd.getOptionValue("sentiment-model", File.class);
			File sentimentProperties = cmd.getOptionValue("sentiment-properties", File.class);

			Float negLimit = cmd.getOptionValue("sentiment-neg-limit", Float.class, DEFAULT_NEG_POL);
			Float posLimit = cmd.getOptionValue("sentiment-pos-limit", Float.class, DEFAULT_POS_POL);

			boolean fakeMode = cmd.hasOption("fake");

			// The explicit file lists are an alternative to the seed/split mechanism
			// and only make sense together.
			if ((trainList == null) != (testList == null)) {
				throw new CommandLine.Exception("Train list and test list must be both declared or both missing");
			}

			// ---

			if (!outputFolder.exists()) {
				boolean mkdirs = outputFolder.mkdirs();
				if (!mkdirs) {
					throw new Exception(String.format("Unable to create folder %s", outputFolder.getAbsolutePath()));
				}
			}

			APIManager am = null;
			if (sentimentModel != null && sentimentProperties != null) {
				LOGGER.info("Loading sentiment models");

				// try-with-resources so the properties stream is closed even on failure
				Properties prp = new Properties();
				try (InputStream iS = new FileInputStream(sentimentProperties)) {
					prp.load(iS);
				}

				am = new APIManager(prp);
				am.loadModel(sentimentModel.getAbsolutePath());
			}

			LOGGER.info("Parsing corpus");
			Corpus[] corpuses = new Corpus[2];
			if (trainList != null) {
				// Explicit file lists: corpuses[0] is the training set, corpuses[1] the test set
				List<File> trainFiles = readList(trainList, mainFolder, "naf");
				List<File> testFiles = readList(testList, mainFolder, "naf");
				corpuses[0] = Corpus.create(false, trainFiles);
				corpuses[1] = Corpus.create(false, testFiles);
			}
			else {
				Corpus myCorpus = Corpus.create(false, mainFolder);
				corpuses = myCorpus.split(seed, split, 1.0f - split);
			}
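
			// With the defaults (seed 2, split 0.75) the split is reproducible:
			// corpuses[0] gets roughly 75% of the documents and corpuses[1] the rest.
			// The seed/split (or the file lists) are presumably the same ones used when
			// the CRF++ test file was produced, since the code below relies on the
			// test-set tokens being aligned one-to-one with the results file.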

			// Rebuild expression boundaries from the BIO tags in the last column of
			// the CRF++ results file: startIndex maps a (global) token offset to the
			// expression starting there, endIndex maps an expression ID to its last
			// token offset.
			HashMap<Integer, Integer> startIndex = new HashMap<>();
			HashMap<Integer, Integer> endIndex = new HashMap<>();

			int exprID = -1;
			int j = -1;

			try (BufferedReader reader = new BufferedReader(new FileReader(resultsFile))) {
				String line;
				while ((line = reader.readLine()) != null) {
					String[] parts = line.split("\\s+");
					if (parts.length <= 1) {
						continue;
					}

					String res = parts[parts.length - 1];
					j++;

					if (res.startsWith("B")) {
						exprID++;
						endIndex.put(exprID, j);
						startIndex.put(j, exprID);
					}
					// The exprID >= 0 guard avoids a null lookup on a malformed file
					// where an I tag precedes any B tag.
					if (res.startsWith("I") && exprID >= 0) {
						// Extend the expression only over contiguous I tokens
						if (endIndex.get(exprID) == j - 1) {
							endIndex.put(exprID, j);
						}
					}
				}
			}
			LOGGER.info("Total tokens in the test: {}", j + 1);
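
			/*
			 * Worked example (hypothetical results lines; only the last column matters):
			 *
			 *   w0 ... O        j=0
			 *   w1 ... B-EXP    j=1 -> exprID 0: startIndex {1=0}, endIndex {0=1}
			 *   w2 ... I-EXP    j=2 -> contiguous, so endIndex {0=2}
			 *   w3 ... O        j=3
			 *
			 * yielding one expression starting at global token offset 1 with
			 * length endIndex(0) - 1 + 1 = 2.
			 */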

			// Re-walk the test corpus with the same global token counter: whenever the
			// counter hits the start of an expression, build a term span of the
			// corresponding length and attach it to the document as a new opinion.
			j = -1;
			for (Path file : corpuses[1].files()) {

				String baseFileName = file.toFile().getName();
				String outputFile = outputFolder.getAbsolutePath() + File.separator + baseFileName;

				LOGGER.debug(baseFileName);

				KAFDocument document = corpuses[1].get(file);
				List<Term> terms = document.getTerms();
				for (int i = 0; i < terms.size(); i++) {
					j++;

					if (startIndex.containsKey(j)) {
						int length = endIndex.get(startIndex.get(j)) - j + 1;
						Span<Term> termSpan = KAFDocument.newTermSpan();
						for (int k = 0; k < length; k++) {
							Term term;
							try {
								term = terms.get(i + k);
							} catch (IndexOutOfBoundsException e) {
								// The expression runs past the end of this document
								LOGGER.warn("Error in token {} ({}) in file {}", i + k, j + 1, baseFileName);
								continue;
							}
							termSpan.addTarget(term);
						}

						Opinion opinion = document.newOpinion();
						opinion.setLabel(label);
						Opinion.OpinionExpression opinionExpression = opinion.createOpinionExpression(termSpan);

						// Sentiment: if an MDFSA model was loaded, score the expression text
						if (am != null) {
							// evaluateSentence is assumed to return -2.0 as a sentinel
							// when the expression cannot be scored; in that case the
							// polarity stays Neutral.
							double computedPolarity = am.evaluateSentence(termSpan.getStr());

							String polarity = "Neutral";
							if (computedPolarity != -2.0) {
								if (computedPolarity < negLimit) {
									polarity = "Negative";
								}
								if (computedPolarity > posLimit) {
									polarity = "Positive";
								}
							}

							opinionExpression.setPolarity(polarity);
						}
					}
				}

				if (!fakeMode) {
					document.save(outputFile);
				}
			}
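			// Sanity check: if the NAF test corpus and the CRF++ results file are
			// aligned, this total matches "Total tokens in the test" logged above.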
			LOGGER.info("Total tokens in the NAFs: {}", j + 1);

		} catch (final Throwable ex) {
			CommandLine.fail(ex);
		}
	}

}