1 package eu.fbk.dkm.pikes.resources.darmstadt;
2
3 import com.google.common.base.Charsets;
4 import com.google.common.io.Files;
5 import eu.fbk.dkm.pikes.resources.NAFFilter;
6 import eu.fbk.utils.core.CommandLine;
7 import ixa.kaflib.KAFDocument;
8 import ixa.kaflib.Opinion;
9 import ixa.kaflib.Span;
10 import ixa.kaflib.Term;
11 import org.apache.commons.io.FileUtils;
12 import org.apache.commons.io.FilenameUtils;
13 import org.slf4j.LoggerFactory;
14 import org.w3c.dom.Document;
15 import org.w3c.dom.Element;
16 import org.w3c.dom.NamedNodeMap;
17 import org.w3c.dom.NodeList;
18
19 import javax.xml.parsers.DocumentBuilder;
20 import javax.xml.parsers.DocumentBuilderFactory;
21 import java.io.ByteArrayInputStream;
22 import java.io.File;
23 import java.util.HashMap;
24 import java.util.HashSet;
25 import java.util.Iterator;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
29
30
31
32
33 public class CorpusAnnotator {
34
35 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CorpusAnnotator.class);
36 private static Pattern spanPattern = Pattern.compile("word_([0-9]+)");
37 private static String TERM_PREFIX = "t";
38
39 private static void getFilesRecursive(File pFile, HashSet<String> folders) {
40 for (File file : pFile.listFiles()) {
41 if (file.isDirectory()) {
42 folders.add(file.getAbsolutePath());
43 getFilesRecursive(file, folders);
44 }
45 }
46 }
47
48 private static Integer getTermFromSpan(String span) {
49 Matcher matcher = spanPattern.matcher(span);
50 if (matcher.find()) {
51 Integer id = Integer.parseInt(matcher.group(1));
52 return id - 1;
53 }
54
55 return null;
56 }
57
58 private static Span<Term> getTermsFromSpan(KAFDocument document, String span) {
59 String[] parts = span.split("[^a-z0-9A-Z_]+");
60 Span<Term> termSpan = KAFDocument.newTermSpan();
61
62 if (parts.length == 1) {
63 Integer id = getTermFromSpan(parts[0]);
64 termSpan.addTarget(document.getTerms().get(id));
65 }
66 else if (parts.length > 1) {
67 Integer id1 = getTermFromSpan(parts[0]);
68 Integer id2 = getTermFromSpan(parts[parts.length - 1]);
69 for (int i = id1; i <= id2; i++) {
70 termSpan.addTarget(document.getTerms().get(i));
71 }
72 }
73
74 return termSpan;
75 }
76
77 public static void main(String[] args) {
78 try {
79 final CommandLine cmd = CommandLine
80 .parser()
81 .withName("eu.fbk.dkm.pikes.resources.darmstadt-loader")
82 .withHeader("Load eu.fbk.dkm.pikes.resources.darmstadt-service-review-corpus")
83 .withOption("i", "input-folder", "the folder of the corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
84 .withOption("f", "force", "Force opinion")
85 .withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
86
87 final File inputFile = cmd.getOptionValue("i", File.class);
88 boolean forceOpinion = cmd.hasOption("f");
89
90 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
91 dbFactory.setValidating(false);
92 dbFactory.setNamespaceAware(true);
93 dbFactory.setFeature("http://xml.org/sax/features/namespaces", false);
94 dbFactory.setFeature("http://xml.org/sax/features/validation", false);
95 dbFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
96 dbFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
97 DocumentBuilder dBuilder;
98 Document doc;
99
100 HashSet<String> folders = new HashSet<>();
101 getFilesRecursive(inputFile, folders);
102
103 HashSet<String> okFolders = new HashSet<>();
104 okLoop:
105 for (String folder : folders) {
106 for (String pattern : CorpusLoader.MMAX_PATTERN) {
107 StringBuffer newFolder = new StringBuffer();
108 newFolder.append(folder);
109 newFolder.append(File.separator);
110 newFolder.append(pattern);
111
112 if (!folders.contains(newFolder.toString())) {
113 continue okLoop;
114 }
115 }
116
117 okFolders.add(folder);
118 }
119
120 for (String folder : okFolders) {
121 LOGGER.info("Entering folder {}", folder);
122
123 String markableDir = folder + File.separator + CorpusLoader.MMAX_PATTERN[1];
124 String basedataDir = folder + File.separator + CorpusLoader.MMAX_PATTERN[0];
125 File nafDir = new File(folder + File.separator + "naf-parsed");
126
127 Iterator<File> fileIterator;
128 fileIterator = FileUtils.iterateFiles(nafDir, new String[]{"naf"}, false);
129 while (fileIterator.hasNext()) {
130 File file = fileIterator.next();
131 String fileBaseName = FilenameUtils.removeExtension(file.getName());
132 LOGGER.info(fileBaseName);
133
134 File annotatedFile = new File(markableDir + File.separator + fileBaseName + CorpusLoader.MMAX_SUFFIXES[1] + ".xml");
135 if (!annotatedFile.exists()) {
136 LOGGER.warn("File {} does not exist", annotatedFile.getAbsolutePath());
137 continue;
138 }
139
140 File basedataFile = new File(basedataDir + File.separator + fileBaseName + CorpusLoader.MMAX_SUFFIXES[0] + ".xml");
141 if (!basedataFile.exists()) {
142 LOGGER.warn("File {} does not exist", basedataFile.getAbsolutePath());
143 continue;
144 }
145
146 KAFDocument document = KAFDocument.createFromFile(file);
147
148 boolean hasGoldOpinions = false;
149 for (Opinion opinion : document.getOpinions()) {
150 if ("gold-eu.fbk.dkm.pikes.resources.darmstadt".equals(opinion.getLabel())) {
151 hasGoldOpinions = true;
152 break;
153 }
154 }
155
156 if (hasGoldOpinions && !forceOpinion) {
157 LOGGER.info("Opinions already present, skipping...");
158
159 } else {
160 String fileContent;
161 fileContent = Files.toString(basedataFile, Charsets.UTF_8);
162 fileContent = fileContent.replaceAll("&", "&");
163 dBuilder = dbFactory.newDocumentBuilder();
164 doc = dBuilder.parse(new ByteArrayInputStream(fileContent.getBytes(Charsets.UTF_8)));
165
166 int origWordCount = doc.getElementsByTagName("word").getLength();
167 int nafWordCount = document.getWFs().size();
168
169 if (origWordCount != nafWordCount) {
170 LOGGER.warn("Word counts differ ({}/{})", origWordCount, nafWordCount);
171 }
172
173 HashMap<String, HashMap<String, String>> markables = new HashMap<>();
174
175 fileContent = Files.toString(annotatedFile, Charsets.UTF_8);
176 dBuilder = dbFactory.newDocumentBuilder();
177 doc = dBuilder.parse(new ByteArrayInputStream(fileContent.getBytes(Charsets.UTF_8)));
178 NodeList nList = doc.getElementsByTagName("markable");
179 for (int temp = 0; temp < nList.getLength(); temp++) {
180 Element nNode = (Element) nList.item(temp);
181 NamedNodeMap attributes = nNode.getAttributes();
182 if (attributes != null) {
183 HashMap<String, String> thisMarkable = new HashMap<>();
184 for (int i = 0; i < attributes.getLength(); i++) {
185 thisMarkable.put(attributes.item(i).getNodeName(), attributes.item(i).getNodeValue());
186 }
187
188 if (thisMarkable.get("id") != null) {
189 markables.put(thisMarkable.get("id"), thisMarkable);
190 }
191 }
192 }
193
194 for (HashMap<String, String> markable : markables.values()) {
195 if (markable.get("annotation_type").equals("opinionexpression")) {
196
197 String holderString = markable.get("opinionholder");
198 String targetString = markable.get("opiniontarget");
199
200 HashMap<String, String> holder = null;
201 HashMap<String, String> target = null;
202
203 if (holderString != null && !holderString.equals("empty")) {
204 holder = markables.get(holderString);
205 }
206 if (targetString != null && !targetString.equals("empty")) {
207 target = markables.get(targetString);
208 }
209
210 Span<Term> termSpan;
211
212 try {
213 termSpan = getTermsFromSpan(document, markable.get("span"));
214 } catch (Exception e) {
215 continue;
216 }
217
218 Opinion opinion = document.createOpinion();
219 opinion.setLabel("gold-eu.fbk.dkm.pikes.resources.darmstadt");
220 Opinion.OpinionExpression expression = opinion.createOpinionExpression(termSpan);
221 if (markable.get("polarity") != null) {
222 expression.setPolarity(markable.get("polarity"));
223 }
224 if (markable.get("strength") != null) {
225 expression.setStrength(markable.get("strength"));
226 }
227
228 if (holder != null) {
229 Span<Term> terms = getTermsFromSpan(document, holder.get("span"));
230 opinion.createOpinionHolder(terms);
231 }
232 if (target != null) {
233 Span<Term> terms = getTermsFromSpan(document, target.get("span"));
234 opinion.createOpinionTarget(terms);
235 }
236 }
237 }
238 }
239
240 NAFFilter.builder(false).withSRLRoleLinking(true, true)
241 .withOpinionLinking(true, true).build().filter(document);
242
243 document.save(file.getAbsolutePath());
244 }
245 }
246
247 } catch (final Throwable ex) {
248 CommandLine.fail(ex);
249 }
250 }
251
252 }