1 package eu.fbk.dkm.pikes.resources.mpqa;
2
3 import eu.fbk.dkm.pikes.resources.NAFFilter;
4 import eu.fbk.utils.core.CommandLine;
5 import ixa.kaflib.KAFDocument;
6 import ixa.kaflib.Opinion;
7 import ixa.kaflib.Term;
8 import org.apache.commons.io.FileUtils;
9 import org.apache.commons.io.FilenameUtils;
10 import org.slf4j.Logger;
11 import org.slf4j.LoggerFactory;
12
13 import java.io.BufferedReader;
14 import java.io.File;
15 import java.io.FileReader;
16 import java.io.IOException;
17 import java.util.*;
18 import java.util.regex.Matcher;
19 import java.util.regex.Pattern;
20
21
22
23
24
25 public class CorpusAnnotator {
26
27 private static final Logger LOGGER = LoggerFactory.getLogger(CorpusAnnotator.class);
28 public static final String DEFAULT_NAF_PARSED_DIR = "NAF-parsed";
29
30 static Pattern keyValuePatt = Pattern.compile("^([^=]+)=(.*)$");
31 static Pattern spanPatt = Pattern.compile("^([^,]*),([^,]*)$");
32 public static List<String> DEFAULT_NAF_EXTENSIONS = new ArrayList<>();
33
34 public static String GOLD_LABEL = "gold-eu.fbk.dkm.pikes.resources.mpqa";
35
36 static {
37 DEFAULT_NAF_EXTENSIONS.add("xml");
38 DEFAULT_NAF_EXTENSIONS.add("naf");
39 }
40
41 public static List<Term> getSpan(List<Term> terms, Span interval) {
42 if (interval == null) {
43 return new ArrayList<Term>();
44 }
45
46 int start = interval.begin;
47 int end = interval.end - 1;
48
49 LOGGER.debug("Start: " + start + " - End: " + end);
50 return getSpan(terms, start, end);
51 }
52
53 public static List<Term> getSpan(List<Term> terms, String interval) {
54 if (interval == null) {
55 return new ArrayList<Term>();
56 }
57
58 Matcher matcher = spanPatt.matcher(interval);
59 if (!matcher.matches()) {
60 return new ArrayList<Term>();
61 }
62
63 int start = Integer.parseInt(matcher.group(1));
64 int end = Integer.parseInt(matcher.group(2)) - 1;
65
66 LOGGER.debug("Start: " + start + " - End: " + end);
67 return getSpan(terms, start, end);
68 }
69
70 public static List<Term> getSpan(List<Term> terms, int start, int end) {
71 List<Term> ret = new ArrayList<>();
72
73 for (Term t : terms) {
74 int tStart = t.getOffset();
75 int tEnd = t.getOffset() + t.getLength();
76 if ((tEnd >= start && tEnd <= end) || (tStart >= start && tStart <= end)) {
77 ret.add(t);
78
79
80
81 }
82 }
83
84 return ret;
85 }
86
87 public static void main(String[] args) {
88 CommandLine cmd = null;
89 try {
90 cmd = CommandLine
91 .parser()
92 .withName("eu.fbk.dkm.pikes.resources.mpqa-annotator")
93 .withHeader("Annotated files with MPQA annotations")
94 .withOption("i", "input-path", "the base path of the MPQA corpus", "DIR",
95 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
96 .withOption("o", "output",
97 String.format("the output path where to load and save produced files, default [basedir]/%s", DEFAULT_NAF_PARSED_DIR),
98 "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
99 .withOption("a", "annotation",
100 String.format("the annotation file, default [basedir]/%s", CorpusPreprocessor.DEFAULT_ANNOTATION_TSV),
101 "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
102 .withOption("e", "extensions", String.format("Input extensions (default %s)", DEFAULT_NAF_EXTENSIONS), "EXTS", CommandLine.Type.STRING, true, true, false)
103 .withOption("t", "test", "test only on this file", "FILE", CommandLine.Type.STRING, true, false, false)
104 .withOption("f", "force", "Force opinion")
105 .withOption("F", "fake", "Fake mode, do not write to files")
106 .withOption("s", "exclude-source-local-null", "Exclude opinion if source is null")
107 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
108 } catch (Throwable ex) {
109 CommandLine.fail(ex);
110 System.exit(1);
111 }
112
113 boolean forceOpinion = false;
114 if (cmd.hasOption("force")) {
115 forceOpinion = true;
116 }
117
118 boolean fake = false;
119 if (cmd.hasOption("fake")) {
120 fake = true;
121 }
122
123 boolean includeNullSources = true;
124 if (cmd.hasOption("s")) {
125 includeNullSources = false;
126 }
127
128 File mainFolder = cmd.getOptionValue("i", File.class);
129 String testFile = cmd.getOptionValue("t", String.class);
130
131 File input = new File(mainFolder.getAbsolutePath() + File.separator + DEFAULT_NAF_PARSED_DIR);
132 if (cmd.hasOption("o")) {
133 input = cmd.getOptionValue("o", File.class);
134 }
135
136 File mpqa = new File(mainFolder.getAbsolutePath() + File.separator + CorpusPreprocessor.DEFAULT_ANNOTATION_TSV);
137 if (cmd.hasOption("a")) {
138 mpqa = cmd.getOptionValue("a", File.class);
139 }
140
141 List<String> extensions = null;
142 if (cmd.hasOption("e")) {
143 extensions = cmd.getOptionValues("e", String.class);
144 }
145 if (extensions == null) {
146 extensions = DEFAULT_NAF_EXTENSIONS;
147 }
148
149 HashMap<String, HashSet<HashMap<String, String>>> opinionsByDocument = new HashMap<>();
150
151 try {
152 if (!input.exists()) {
153 throw new IOException(String.format("Folder %s does not exist", input.getAbsolutePath()));
154 }
155 if (!mpqa.exists()) {
156 throw new IOException(String.format("File %s does not exist", input.getAbsolutePath()));
157 }
158
159 LOGGER.info("Loading TSV file");
160 BufferedReader reader = new BufferedReader(new FileReader(mpqa));
161 String line;
162 while ((line = reader.readLine()) != null) {
163 String[] parts = line.split("\t");
164 if (parts.length < 5) {
165 continue;
166 }
167
168 HashMap<String, String> properties = new HashMap<>();
169 for (String s : parts) {
170 Matcher matcher = keyValuePatt.matcher(s);
171 if (matcher.matches()) {
172 properties.put(matcher.group(1), matcher.group(2));
173 }
174 }
175
176 String document = properties.get("document");
177 LOGGER.trace(document);
178 if (document == null) {
179 continue;
180 }
181
182 if (!opinionsByDocument.containsKey(document)) {
183 opinionsByDocument.put(document, new HashSet<HashMap<String, String>>());
184 }
185 opinionsByDocument.get(document).add(properties);
186 }
187 reader.close();
188
189 LOGGER.info("Loading file list");
190 Iterator<File> fileIterator = FileUtils.iterateFiles(input, extensions.toArray(new String[extensions.size()]), true);
191
192 while (fileIterator.hasNext()) {
193 File file = fileIterator.next();
194
195 String fileBaseName = FilenameUtils.removeExtension(file.getName());
196 if (testFile != null && !testFile.equals(fileBaseName)) {
197 continue;
198 }
199
200 LOGGER.info(String.format("Loading file %s", file));
201 KAFDocument document = KAFDocument.createFromFile(file);
202
203
204 List<Opinion> opinions = document.getOpinions();
205 boolean hasGoldOpinions = false;
206 for (Opinion opinion : opinions) {
207 if (opinion.getLabel().equals(GOLD_LABEL)) {
208 hasGoldOpinions = true;
209 break;
210 }
211 }
212
213 if (hasGoldOpinions && !forceOpinion) {
214 LOGGER.info("Gold opinions already present, skipping...");
215 }
216 else {
217 List<Term> terms = document.getTerms();
218
219 String documentID = document.getPublic().uri;
220 HashSet<HashMap<String, String>> map = opinionsByDocument.get(documentID);
221 if (map == null) {
222 continue;
223 }
224
225 for (HashMap<String, String> properties : map) {
226
227
228 List<Term> sourceSpan = new ArrayList<>();
229 String sourceLocal = properties.get("source-local");
230 if (sourceLocal == null && !includeNullSources) {
231 LOGGER.trace("source-local is null");
232 continue;
233 }
234 if (sourceLocal != null) {
235 String[] parts = sourceLocal.split("\\|");
236 for (String part : parts) {
237 sourceSpan.addAll(getSpan(terms, part));
238 }
239 }
240
241
242 List<Term> targetSpan = new ArrayList<>();
243 targetSpan.addAll(getSpan(terms, properties.get("target")));
244
245
246 List<Term> attitudeSpan = new ArrayList<>();
247 attitudeSpan.addAll(getSpan(terms, properties.get("expression")));
248
249 Opinion opinion = document.newOpinion();
250 opinion.setLabel(GOLD_LABEL + "-" + properties.get("type"));
251 LOGGER.debug("Adding opinion {}", properties.get("sentence"));
252
253 if (sourceSpan.size() > 0) {
254 opinion.createOpinionHolder(KAFDocument.newTermSpan(sourceSpan));
255 }
256 if (targetSpan.size() > 0) {
257 opinion.createOpinionTarget(KAFDocument.newTermSpan(targetSpan));
258 }
259 if (attitudeSpan.size() > 0) {
260 opinion.createOpinionExpression(KAFDocument.newTermSpan(attitudeSpan));
261 opinion.getOpinionExpression().setPolarity(properties.get("sentiment"));
262 opinion.getOpinionExpression().setStrength(properties.get("intensity"));
263 }
264 }
265 }
266
267 NAFFilter.builder(false).withSRLRoleLinking(true, true)
268 .withOpinionLinking(true, true).build().filter(document);
269
270 if (!fake) {
271 document.save(file.getAbsolutePath());
272 }
273 }
274 } catch (Exception e) {
275 LOGGER.error(e.getMessage());
276 e.printStackTrace();
277 }
278
279 }
280 }