1 package eu.fbk.dkm.pikes.resources.goodbadfor;
2
3 import eu.fbk.dkm.pikes.resources.mpqa.Record;
4 import eu.fbk.dkm.pikes.resources.mpqa.RecordSet;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.KAFDocument;
7 import ixa.kaflib.Opinion;
8 import ixa.kaflib.Term;
9 import org.apache.commons.io.FileUtils;
10 import org.apache.commons.io.FilenameUtils;
11 import org.apache.commons.lang.StringEscapeUtils;
12 import org.slf4j.Logger;
13 import org.slf4j.LoggerFactory;
14
15 import javax.xml.stream.XMLStreamException;
16 import java.io.File;
17 import java.io.IOException;
18 import java.util.ArrayList;
19 import java.util.HashMap;
20 import java.util.Iterator;
21 import java.util.List;
22
23
24
25
26
27 public class CorpusAnnotator {
28
29 private static final Logger LOGGER = LoggerFactory.getLogger(CorpusAnnotator.class);
30
31 public static void main(final String[] args) throws IOException, XMLStreamException {
32 try {
33 final CommandLine cmd = CommandLine
34 .parser()
35 .withName("eu.fbk.dkm.pikes.resources.goodbadfor-annotator")
36 .withHeader("Annotated files with goodFor/badFor annotations")
37 .withOption("i", "input-path", "the base path of the corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
38 .withOption("t", "test", "test only on this file", "FILE", CommandLine.Type.STRING, true, false, false)
39 .withOption("f", "force", "Force opinion")
40 .withOption("s", "skip", "Skip writing files and show them")
41 .withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
42
43 final File inputPath = cmd.getOptionValue("i", File.class);
44
45 File annotationsFolder = new File(inputPath.getAbsolutePath() + File.separator + "MPQA" + File.separator);
46 File nafFolder = new File(inputPath.getAbsolutePath() + File.separator + "NAF-parsed" + File.separator);
47
48
49 boolean forceOpinion = false;
50 if (cmd.hasOption("force")) {
51 forceOpinion = true;
52 }
53
54 boolean skip = false;
55 if (cmd.hasOption("skip")) {
56 skip = true;
57 }
58
59 String testFile = cmd.getOptionValue("t", String.class);
60
61 if (!annotationsFolder.exists()) {
62 LOGGER.error("Folder {} does not exist", annotationsFolder.getAbsolutePath());
63 }
64
65 if (!nafFolder.exists()) {
66 LOGGER.error("Folder {} does not exist", nafFolder.getAbsolutePath());
67 }
68
69
70
71
72
73 Iterator<File> fileIterator;
74 fileIterator = FileUtils.iterateFiles(nafFolder, new String[]{"naf"}, false);
75 while (fileIterator.hasNext()) {
76 File file = fileIterator.next();
77 String fileBaseName = FilenameUtils.removeExtension(file.getName());
78
79 if (testFile != null && !testFile.equals(fileBaseName)) {
80 continue;
81 }
82
83 File mpqaFile = new File(annotationsFolder.getAbsolutePath() + File.separator + fileBaseName + ".eu.fbk.dkm.pikes.resources.mpqa");
84
85
86
87
88 LOGGER.info(String.format("Loading file %s", mpqaFile));
89 if (!mpqaFile.exists()) {
90 LOGGER.warn("File {} does not exist", mpqaFile.getAbsolutePath());
91 continue;
92 }
93
94 String text = "";
95 LOGGER.info(String.format("Loading file %s", file));
96 KAFDocument document = KAFDocument.createFromFile(file);
97 text = document.getRawText();
98 text = StringEscapeUtils.unescapeHtml(text);
99 List<Term> terms = document.getTerms();
100
101
102 List<Opinion> opinions = document.getOpinions();
103 if (opinions.size() > 0 && !forceOpinion) {
104 LOGGER.info("Opinions already present, skipping...");
105 continue;
106 }
107
108 final RecordSet annotations = RecordSet.readFromFile(mpqaFile);
109
110 HashMap<String, Record> index = new HashMap<>();
111
112 for (Record record : annotations.getRecords()) {
113
114 String span1 = record.getSpan().apply(text);
115 String span2 = record.getValue("span");
116
117 if (span1 == null || span2 == null) {
118 continue;
119 }
120
121 span1 = StringEscapeUtils.unescapeHtml(span1);
122 span2 = StringEscapeUtils.unescapeHtml(span2);
123
124 String span1OnlyLetters = span1.replaceAll("[^0-9a-zA-Z]", "");
125 String span2OnlyLetters = span2.replaceAll("[^0-9a-zA-Z]", "");
126
127 if (!span1OnlyLetters.equals(span2OnlyLetters)) {
128 LOGGER.trace(span1);
129 LOGGER.trace(span2);
130 LOGGER.warn("The span is different, skipping");
131 continue;
132 }
133
134 String id = record.getValue("id");
135 if (id == null) {
136 LOGGER.warn("ID is null");
137 continue;
138 }
139
140
141
142
143
144
145 index.put(id, record);
146 }
147
148 for (Record record : annotations.getRecords()) {
149 String type = record.getName();
150 if (type == null) {
151 continue;
152 }
153 if (type.equals("gfbf") || type.equals("influencer")) {
154
155 String label = "gold-" + type;
156 String attribute = "polarity";
157 if (type.equals("influencer")) {
158 attribute = "effect";
159 }
160
161 LOGGER.debug(record.toString());
162
163 try {
164 Record agent = index.get(record.getValue("agent"));
165 Record target = index.get(record.getValue("object"));
166
167 List<Term> attitudeSpan = new ArrayList<>();
168 List<Term> targetSpan = new ArrayList<>();
169 List<Term> sourceSpan = new ArrayList<>();
170
171 attitudeSpan.addAll(eu.fbk.dkm.pikes.resources.mpqa.CorpusAnnotator.getSpan(terms, record.getSpan()));
172
173 Opinion opinion = document.newOpinion();
174 opinion.setLabel(label);
175
176 if (agent != null) {
177 sourceSpan.addAll(eu.fbk.dkm.pikes.resources.mpqa.CorpusAnnotator.getSpan(terms, agent.getSpan()));
178 if (sourceSpan.size() > 0) {
179 Opinion.OpinionHolder opinionHolder = opinion.createOpinionHolder(KAFDocument.newTermSpan(sourceSpan));
180 String attitude = agent.getValue("writerAttitude");
181 if (attitude != null) {
182 opinionHolder.setType(attitude);
183 }
184 }
185 }
186
187 if (target != null) {
188 targetSpan.addAll(eu.fbk.dkm.pikes.resources.mpqa.CorpusAnnotator.getSpan(terms, target.getSpan()));
189 if (targetSpan.size() > 0) {
190 Opinion.OpinionTarget opinionTarget = opinion.createOpinionTarget(KAFDocument.newTermSpan(targetSpan));
191 String attitude = target.getValue("writerAttitude");
192 if (attitude != null) {
193 opinionTarget.setType(attitude);
194 }
195 }
196 }
197
198
199 if (attitudeSpan.size() > 0) {
200 opinion.createOpinionExpression(KAFDocument.newTermSpan(attitudeSpan));
201 opinion.getOpinionExpression().setPolarity(record.getValue(attribute));
202 }
203
204 } catch (Exception e) {
205 LOGGER.warn(e.getMessage());
206 e.printStackTrace();
207 }
208
209
210 }
211 }
212
213 if (skip) {
214 System.out.println(document);
215 }
216 else {
217 document.save(file.getAbsolutePath());
218 }
219 }
220
221 } catch (final Throwable ex) {
222 CommandLine.fail(ex);
223 }
224 }
225
226
227 }