1   package eu.fbk.dkm.pikes.resources.mpqa;
2   
3   import eu.fbk.dkm.pikes.resources.NAFFilter;
4   import eu.fbk.utils.core.CommandLine;
5   import ixa.kaflib.KAFDocument;
6   import ixa.kaflib.Opinion;
7   import ixa.kaflib.Term;
8   import org.apache.commons.io.FileUtils;
9   import org.apache.commons.io.FilenameUtils;
10  import org.slf4j.Logger;
11  import org.slf4j.LoggerFactory;
12  
13  import java.io.BufferedReader;
14  import java.io.File;
15  import java.io.FileReader;
16  import java.io.IOException;
17  import java.util.*;
18  import java.util.regex.Matcher;
19  import java.util.regex.Pattern;
20  
21  /**
22   * Created by alessio on 20/03/15.
23   */
24  
25  public class CorpusAnnotator {
26  
27  	private static final Logger LOGGER = LoggerFactory.getLogger(CorpusAnnotator.class);
28  	public static final String DEFAULT_NAF_PARSED_DIR = "NAF-parsed";
29  
30  	static Pattern keyValuePatt = Pattern.compile("^([^=]+)=(.*)$");
31  	static Pattern spanPatt = Pattern.compile("^([^,]*),([^,]*)$");
32  	public static List<String> DEFAULT_NAF_EXTENSIONS = new ArrayList<>();
33  
34  	public static String GOLD_LABEL = "gold-eu.fbk.dkm.pikes.resources.mpqa";
35  
36  	static {
37  		DEFAULT_NAF_EXTENSIONS.add("xml");
38  		DEFAULT_NAF_EXTENSIONS.add("naf");
39  	}
40  
41  	public static List<Term> getSpan(List<Term> terms, Span interval) {
42  		if (interval == null) {
43  			return new ArrayList<Term>();
44  		}
45  
46  		int start = interval.begin;
47  		int end = interval.end - 1;
48  
49  		LOGGER.debug("Start: " + start + " - End: " + end);
50  		return getSpan(terms, start, end);
51  	}
52  
53  	public static List<Term> getSpan(List<Term> terms, String interval) {
54  		if (interval == null) {
55  			return new ArrayList<Term>();
56  		}
57  
58  		Matcher matcher = spanPatt.matcher(interval);
59  		if (!matcher.matches()) {
60  			return new ArrayList<Term>();
61  		}
62  
63  		int start = Integer.parseInt(matcher.group(1));
64  		int end = Integer.parseInt(matcher.group(2)) - 1;
65  
66  		LOGGER.debug("Start: " + start + " - End: " + end);
67  		return getSpan(terms, start, end);
68  	}
69  
70  	public static List<Term> getSpan(List<Term> terms, int start, int end) {
71  		List<Term> ret = new ArrayList<>();
72  
73  		for (Term t : terms) {
74  			int tStart = t.getOffset();
75  			int tEnd = t.getOffset() + t.getLength();
76  			if ((tEnd >= start && tEnd <= end) || (tStart >= start && tStart <= end)) {
77  				ret.add(t);
78  //				System.out.println(t);
79  //				System.out.println(t.getOffset());
80  //				System.out.println(t.getLength());
81  			}
82  		}
83  
84  		return ret;
85  	}
86  
87  	public static void main(String[] args) {
88  		CommandLine cmd = null;
89  		try {
90  			cmd = CommandLine
91  					.parser()
92  					.withName("eu.fbk.dkm.pikes.resources.mpqa-annotator")
93  					.withHeader("Annotated files with MPQA annotations")
94  					.withOption("i", "input-path", "the base path of the MPQA corpus", "DIR",
95  							CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
96  					.withOption("o", "output",
97  							String.format("the output path where to load and save produced files, default [basedir]/%s", DEFAULT_NAF_PARSED_DIR),
98  							"DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
99  					.withOption("a", "annotation",
100 							String.format("the annotation file, default [basedir]/%s", CorpusPreprocessor.DEFAULT_ANNOTATION_TSV),
101 							"FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
102 					.withOption("e", "extensions", String.format("Input extensions (default %s)", DEFAULT_NAF_EXTENSIONS), "EXTS", CommandLine.Type.STRING, true, true, false)
103 					.withOption("t", "test", "test only on this file", "FILE", CommandLine.Type.STRING, true, false, false)
104 					.withOption("f", "force", "Force opinion")
105 					.withOption("F", "fake", "Fake mode, do not write to files")
106 					.withOption("s", "exclude-source-local-null", "Exclude opinion if source is null")
107 					.withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
108 		} catch (Throwable ex) {
109 			CommandLine.fail(ex);
110 			System.exit(1);
111 		}
112 
113 		boolean forceOpinion = false;
114 		if (cmd.hasOption("force")) {
115 			forceOpinion = true;
116 		}
117 
118 		boolean fake = false;
119 		if (cmd.hasOption("fake")) {
120 			fake = true;
121 		}
122 
123 		boolean includeNullSources = true;
124 		if (cmd.hasOption("s")) {
125 			includeNullSources = false;
126 		}
127 
128 		File mainFolder = cmd.getOptionValue("i", File.class);
129 		String testFile = cmd.getOptionValue("t", String.class);
130 
131 		File input = new File(mainFolder.getAbsolutePath() + File.separator + DEFAULT_NAF_PARSED_DIR);
132 		if (cmd.hasOption("o")) {
133 			input = cmd.getOptionValue("o", File.class);
134 		}
135 
136 		File mpqa = new File(mainFolder.getAbsolutePath() + File.separator + CorpusPreprocessor.DEFAULT_ANNOTATION_TSV);
137 		if (cmd.hasOption("a")) {
138 			mpqa = cmd.getOptionValue("a", File.class);
139 		}
140 
141 		List<String> extensions = null;
142 		if (cmd.hasOption("e")) {
143 			extensions = cmd.getOptionValues("e", String.class);
144 		}
145 		if (extensions == null) {
146 			extensions = DEFAULT_NAF_EXTENSIONS;
147 		}
148 
149 		HashMap<String, HashSet<HashMap<String, String>>> opinionsByDocument = new HashMap<>();
150 
151 		try {
152 			if (!input.exists()) {
153 				throw new IOException(String.format("Folder %s does not exist", input.getAbsolutePath()));
154 			}
155 			if (!mpqa.exists()) {
156 				throw new IOException(String.format("File %s does not exist", input.getAbsolutePath()));
157 			}
158 
159 			LOGGER.info("Loading TSV file");
160 			BufferedReader reader = new BufferedReader(new FileReader(mpqa));
161 			String line;
162 			while ((line = reader.readLine()) != null) {
163 				String[] parts = line.split("\t");
164 				if (parts.length < 5) {
165 					continue;
166 				}
167 
168 				HashMap<String, String> properties = new HashMap<>();
169 				for (String s : parts) {
170 					Matcher matcher = keyValuePatt.matcher(s);
171 					if (matcher.matches()) {
172 						properties.put(matcher.group(1), matcher.group(2));
173 					}
174 				}
175 
176 				String document = properties.get("document");
177 				LOGGER.trace(document);
178 				if (document == null) {
179 					continue;
180 				}
181 
182 				if (!opinionsByDocument.containsKey(document)) {
183 					opinionsByDocument.put(document, new HashSet<HashMap<String, String>>());
184 				}
185 				opinionsByDocument.get(document).add(properties);
186 			}
187 			reader.close();
188 
189 			LOGGER.info("Loading file list");
190 			Iterator<File> fileIterator = FileUtils.iterateFiles(input, extensions.toArray(new String[extensions.size()]), true);
191 
192 			while (fileIterator.hasNext()) {
193 				File file = fileIterator.next();
194 
195 				String fileBaseName = FilenameUtils.removeExtension(file.getName());
196 				if (testFile != null && !testFile.equals(fileBaseName)) {
197 					continue;
198 				}
199 
200 				LOGGER.info(String.format("Loading file %s", file));
201 				KAFDocument document = KAFDocument.createFromFile(file);
202 
203 				// Check if there are already opinions
204 				List<Opinion> opinions = document.getOpinions();
205 				boolean hasGoldOpinions = false;
206 				for (Opinion opinion : opinions) {
207 					if (opinion.getLabel().equals(GOLD_LABEL)) {
208 						hasGoldOpinions = true;
209 						break;
210 					}
211 				}
212 
213 				if (hasGoldOpinions && !forceOpinion) {
214 					LOGGER.info("Gold opinions already present, skipping...");
215 				}
216 				else {
217 					List<Term> terms = document.getTerms();
218 
219 					String documentID = document.getPublic().uri;
220 					HashSet<HashMap<String, String>> map = opinionsByDocument.get(documentID);
221 					if (map == null) {
222 						continue;
223 					}
224 
225 					for (HashMap<String, String> properties : map) {
226 
227 						// Source
228 						List<Term> sourceSpan = new ArrayList<>();
229 						String sourceLocal = properties.get("source-local");
230 						if (sourceLocal == null && !includeNullSources) {
231 							LOGGER.trace("source-local is null");
232 							continue;
233 						}
234 						if (sourceLocal != null) {
235 							String[] parts = sourceLocal.split("\\|");
236 							for (String part : parts) {
237 								sourceSpan.addAll(getSpan(terms, part));
238 							}
239 						}
240 
241 						// Target
242 						List<Term> targetSpan = new ArrayList<>();
243 						targetSpan.addAll(getSpan(terms, properties.get("target")));
244 
245 						// Attitude
246 						List<Term> attitudeSpan = new ArrayList<>();
247 						attitudeSpan.addAll(getSpan(terms, properties.get("expression")));
248 
249 						Opinion opinion = document.newOpinion();
250 						opinion.setLabel(GOLD_LABEL + "-" + properties.get("type"));
251 						LOGGER.debug("Adding opinion {}", properties.get("sentence"));
252 
253 						if (sourceSpan.size() > 0) {
254 							opinion.createOpinionHolder(KAFDocument.newTermSpan(sourceSpan));
255 						}
256 						if (targetSpan.size() > 0) {
257 							opinion.createOpinionTarget(KAFDocument.newTermSpan(targetSpan));
258 						}
259 						if (attitudeSpan.size() > 0) {
260 							opinion.createOpinionExpression(KAFDocument.newTermSpan(attitudeSpan));
261 							opinion.getOpinionExpression().setPolarity(properties.get("sentiment"));
262 							opinion.getOpinionExpression().setStrength(properties.get("intensity"));
263 						}
264 					}
265 				}
266 
267 				NAFFilter.builder(false).withSRLRoleLinking(true, true)
268 						.withOpinionLinking(true, true).build().filter(document);
269 
270 				if (!fake) {
271 					document.save(file.getAbsolutePath());
272 				}
273 			}
274 		} catch (Exception e) {
275 			LOGGER.error(e.getMessage());
276 			e.printStackTrace();
277 		}
278 
279 	}
280 }