1   package eu.fbk.dkm.pikes.raid;
2   
3   import com.google.common.collect.Lists;
4   import com.google.common.collect.Multimap;
5   import eu.fbk.dkm.pikes.resources.SenticNet;
6   import eu.fbk.dkm.pikes.resources.SubjectivityLexicon;
7   import eu.fbk.dkm.pikes.resources.WordNet;
8   import eu.fbk.dkm.pikes.resources.mpqa.CorpusAnnotator;
9   import eu.fbk.utils.analysis.stemmer.Stemmer;
10  import eu.fbk.utils.analysis.stemmer.StemmerFactory;
11  import eu.fbk.utils.core.CommandLine;
12  import eu.fbk.utils.svm.Classifier;
13  import eu.fbk.utils.svm.LabelledVector;
14  import eu.fbk.utils.svm.Vector;
15  import ixa.kaflib.KAFDocument;
16  import ixa.kaflib.Opinion;
17  import ixa.kaflib.Term;
18  import org.apache.commons.io.FileUtils;
19  import org.slf4j.LoggerFactory;
20  
21  import java.io.File;
22  import java.util.*;
23  
24  /**
25   * Created by alessio on 17/04/15.
26   */
27  
28  public class SenticSubjlexTraining {
29  
30  	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(SenticSubjlexTraining.class);
31  	private static final Integer MAX_DOCS = 200;
32  
33  	public static void main(String[] args) {
34  		try {
35  			final CommandLine cmd = CommandLine
36  					.parser()
37  					.withName("yamcha-extractor")
38  					.withHeader("Extract YAMCHA training set")
39  					.withOption("i", "input-folder", "the folder of the corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
40  					.withOption("w", "wordnet-path", "WordNet dict folder", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
41  //					.withOption("o", "output-folder", "output folder", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
42  					.withOption("e", "extensions", String.format("Input extensions (default %s)", CorpusAnnotator.DEFAULT_NAF_EXTENSIONS), "EXTS", CommandLine.Type.STRING, true, true, false)
43  					.withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
44  
45  			File mainFolder = cmd.getOptionValue("i", File.class);
46  			File wnFolder = cmd.getOptionValue("w", File.class);
47  //			File outputFolder = cmd.getOptionValue("o", File.class);
48  
49  			List<String> extensions = null;
50  			if (cmd.hasOption("e")) {
51  				extensions = cmd.getOptionValues("e", String.class);
52  			}
53  			if (extensions == null) {
54  				extensions = CorpusAnnotator.DEFAULT_NAF_EXTENSIONS;
55  			}
56  
57  			Stemmer stemmer = StemmerFactory.getInstance(Locale.US);
58  
59  //			if (!outputFolder.exists()) {
60  //				boolean createdOutputFolder = outputFolder.mkdirs();
61  //				if (!createdOutputFolder) {
62  //					LOGGER.error("Unable to create {}", outputFolder.getAbsolutePath());
63  //					System.exit(1);
64  //				}
65  //			}
66  //
67  //			File trainDataFile = new File(outputFolder.getAbsolutePath() + File.separator + "data.train");
68  //			BufferedWriter writer = new BufferedWriter(new FileWriter(trainDataFile));
69  
70  			SenticNet senticNet = SenticNet.getInstance();
71  			SubjectivityLexicon subjectivityLexicon = SubjectivityLexicon.getInstance();
72  
73  			WordNet.setPath(wnFolder.getAbsolutePath());
74  			WordNet.init();
75  
76  			int numFiles = 0;
77  
78  			LOGGER.info("Loading file list");
79  
80  			if (!mainFolder.exists()) {
81  				LOGGER.error("Folder {} does not exist", mainFolder.getAbsolutePath());
82  			}
83  
84  			Iterator<File> fileIterator;
85  //			Classifier.Parameters parameters = Classifier.Parameters.forSVMLinearKernel(2, new float[]{1, 2.5f}, 1.0f);
86  			Classifier.Parameters parameters = Classifier.Parameters.forLinearL1LossL2Reg(2, new float[]{1, 2}, 1.0f, 1.0f);
87  			List<LabelledVector> trainingSet = new ArrayList<>();
88  
89  //			HashMap<String, KAFDocument> documents = new HashMap<>();
90  
91  			fileIterator = FileUtils.iterateFiles(mainFolder, extensions.toArray(new String[extensions.size()]), true);
92  			List<File> files = Lists.newArrayList(fileIterator);
93  			Collections.shuffle(files);
94  			for (File file : files) {
95  				numFiles++;
96  				if (MAX_DOCS != null && numFiles > MAX_DOCS) {
97  					break;
98  				}
99  
100 				LOGGER.info(String.format("Loading file %s", file));
101 				KAFDocument document = KAFDocument.createFromFile(file);
102 //				documents.put(file.getAbsolutePath(), document);
103 
104 				Multimap<Term, SenticNet.Lexeme> senticnetMM = senticNet.match(document, document.getTerms());
105 				Multimap<Term, SubjectivityLexicon.Lexeme> subjectivityMM = subjectivityLexicon.match(document, document.getTerms());
106 
107 				HashSet<Term> opinionTerms = new HashSet<>();
108 
109 				for (Opinion opinion : document.getOpinions()) {
110 					if (opinion.getOpinionExpression() == null) {
111 						continue;
112 					}
113 
114 					if (opinion.getLabel() == null) {
115 						continue;
116 					}
117 
118 					if (!opinion.getLabel().contains("gold")) {
119 						continue;
120 					}
121 
122 					if (opinion.getOpinionExpression().getSpan() == null) {
123 						continue;
124 					}
125 
126 					for (Term term : opinion.getOpinionExpression().getSpan().getTargets()) {
127 						opinionTerms.add(term);
128 					}
129 				}
130 
131 				for (Term term : document.getTerms()) {
132 					final LabelledVector.Builder builder = Vector.builder();
133 
134 					boolean inDataset = false;
135 
136 					if (!senticnetMM.get(term).isEmpty()) {
137 						builder.set("SENTIC", true);
138 						inDataset = true;
139 					}
140 					if (!subjectivityMM.get(term).isEmpty()) {
141 						builder.set("SUBJLEX", true);
142 						inDataset = true;
143 					}
144 
145 					if (!inDataset) {
146 						continue;
147 					}
148 
149 					builder.set("LEMMA." + term.getLemma(), true);
150 					builder.set("MORPHO." + term.getMorphofeat(), true);
151 					builder.set("POS." + term.getPos(), true);
152 
153 					int label = 0;
154 					if (opinionTerms.contains(term)) {
155 						label = 1;
156 					}
157 					LabelledVector vector = builder.build().label(label, null);
158 					trainingSet.add(vector);
159 				}
160 			}
161 
162 //			FrequencyHashSet<String> wordnets = new FrequencyHashSet<>();
163 //			for (String fileName : documents.keySet()) {
164 //				KAFDocument document = documents.get(fileName);
165 //				List<Term> terms = document.getTerms();
166 //				for (Term term : terms) {
167 //					if (!heads.get(document).contains(term)) {
168 //						continue;
169 //					}
170 //
171 //					List<ExternalRef> externalRefs = term.getExternalRefs();
172 //					for (ExternalRef externalRef : externalRefs) {
173 //						String resource = externalRef.getResource();
174 //						if (resource.equals("wn30-ukb")) {
175 //							String wn = externalRef.getReference();
176 //							wordnets.addAll(WordNet.getHyponyms(wn));
177 //							wordnets.addAll(WordNet.getHypernyms(wn));
178 //							wordnets.addAll(WordNet.getGenericSet(wn, net.didion.jwnl.data.PointerType.SIMILAR_TO));
179 //						}
180 //					}
181 //				}
182 //			}
183 //
184 //			SortedSet<Map.Entry<String, Integer>> wnSorted = wordnets.getSorted();
185 //			for (Map.Entry<String, Integer> entry : wnSorted) {
186 //				System.out.println(WordNet.getLemmas(entry.getKey()));
187 //				System.out.println(entry);
188 //				System.out.println();
189 //			}
190 
191 //			LOGGER.info("Feature analysis:\n{}", FeatureStats.toString(FeatureStats.forVectors(2, trainingSet, null).values()));
192 
193 			List<Classifier.Parameters> grid = parameters.grid(25, 10);
194 //			Classifier classifier = Classifier.train(grid, trainingSet, ConfusionMatrix.labelComparator(PrecisionRecall.Measure.F1, 1, true));
195 //			ConfusionMatrix crossValidate = Classifier.crossValidate(classifier.getParameters(), trainingSet, 3);
196 
197 //			ConfusionMatrix crossValidate = Classifier.crossValidate(parameters, trainingSet, 3);
198 //			LOGGER.info("\n" + crossValidate.toString());
199 
200 //				SVM mySVM = SVM.train(parameters, trainingSet);
201 //				List<LabelledVector> output = mySVM.predict(false, trainingSet);
202 //				ConfusionMatrix confusionMatrix = LabelledVector.evaluate(trainingSet, output, 2);
203 //				System.out.println(confusionMatrix);
204 
205 //				mySVM.writeTo();
206 
207 //			writer.close();
208 		} catch (final Throwable ex) {
209 			CommandLine.fail(ex);
210 		}
211 	}
212 }