1 package eu.fbk.dkm.pikes.resources.darmstadt;
2
3 import java.io.File;
4 import java.nio.file.Files;
5 import java.util.ArrayList;
6 import java.util.HashMap;
7 import java.util.List;
8 import java.util.Set;
9
10 import javax.xml.parsers.DocumentBuilder;
11 import javax.xml.parsers.DocumentBuilderFactory;
12 import javax.xml.transform.OutputKeys;
13 import javax.xml.transform.Transformer;
14 import javax.xml.transform.TransformerFactory;
15 import javax.xml.transform.dom.DOMSource;
16 import javax.xml.transform.stream.StreamResult;
17
18 import com.google.common.collect.Sets;
19
20 import eu.fbk.rdfpro.util.Statements;
21 import org.eclipse.rdf4j.model.IRI;
22 import org.eclipse.rdf4j.model.impl.URIImpl;
23 import org.slf4j.LoggerFactory;
24 import org.w3c.dom.Document;
25 import org.w3c.dom.Element;
26
27 import ixa.kaflib.KAFDocument;
28 import ixa.kaflib.Opinion;
29
30 import eu.fbk.dkm.pikes.naflib.Corpus;
31 import eu.fbk.utils.core.CommandLine;
32
33
34
35
36
37 public class ConvertNafDocumentsToXml {
38
39 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(ConvertNafDocumentsToXml.class);
40
41 public static void main(String[] args) {
42 try {
43 final CommandLine cmd = CommandLine
44 .parser()
45 .withName("yamcha-extractor")
46 .withHeader("Check ESWC dataset with Darmstadt")
47 .withOption("i", "input-folder", "the folder of the NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
48 .withOption("o", "output-file", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
49 .withOption("l", "label", "opinion label", "STRING", CommandLine.Type.STRING, true, false, true)
50 .withOption("n", "numeric", "use numeric values for IDs")
51 .withOption(null, "list", "use list of file to sort", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
52 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
53
54 File inputFolder = cmd.getOptionValue("input-folder", File.class);
55 File outputFile = cmd.getOptionValue("output-file", File.class);
56 Set<String> labels = Sets.newHashSet(cmd.getOptionValue("label", String.class, "").split(","));
57
58 File list = cmd.getOptionValue("list", File.class);
59
60 boolean useNumeric = cmd.hasOption("numeric");
61
62 DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
63 DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
64
65 Document doc = docBuilder.newDocument();
66 Element rootElement = doc.createElement("Sentences");
67 doc.appendChild(rootElement);
68
69 int id = 0;
70
71 Iterable<KAFDocument> corpus = Corpus.create(false, inputFolder);
72
73 if (list != null) {
74 LOGGER.info("Load file list from {}", list.getAbsolutePath());
75 ArrayList<KAFDocument> files = new ArrayList<>();
76 List<String> fileList = Files.readAllLines(list.toPath());
77 for (String fileName : fileList) {
78 fileName = fileName.trim();
79 if (fileName.length() == 0) {
80 continue;
81 }
82 String documentFileName = inputFolder + File.separator + fileName;
83 files.add(KAFDocument.createFromFile(new File(documentFileName)));
84 }
85 corpus = files;
86 }
87
88 int fileNum = 0;
89 for (KAFDocument document : corpus) {
90 fileNum++;
91 LOGGER.info("File {}", document.getPublic().uri);
92 Element sentenceElement = doc.createElement("sentence");
93
94 if (useNumeric) {
95 sentenceElement.setAttribute("id", "" + id++);
96 }
97 else {
98 IRI uri = Statements.VALUE_FACTORY.createIRI(document.getPublic().uri);
99 sentenceElement.setAttribute("id", uri.getLocalName());
100 }
101
102 rootElement.appendChild(sentenceElement);
103 Element textElement = doc.createElement("text");
104 textElement.appendChild(doc.createTextNode(document.getRawText()));
105 sentenceElement.appendChild(textElement);
106
107 for (Opinion opinion : document.getOpinions()) {
108
109 boolean matches = false;
110 for (String l : labels) {
111 if (opinion.getLabel().contains(l)) {
112 matches = true;
113 break;
114 }
115 }
116 if (!matches) {
117 continue;
118 }
119
120 String expression = null;
121 if (opinion.getOpinionExpression() == null) {
122 continue;
123 }
124
125 HashMap<String, Integer> indexes = new HashMap<>();
126 indexes.put("holder-start", -1);
127 indexes.put("holder-end", -1);
128 indexes.put("target-start", -1);
129 indexes.put("target-end", -1);
130
131 expression = opinion.getExpressionSpan().getStr();
132 indexes.put("expression-start", opinion.getExpressionSpan().getTargets().get(0).getOffset());
133 indexes.put("expression-end", opinion.getExpressionSpan().getTargets().get(opinion.getExpressionSpan().getTargets().size() - 1).getOffset() +
134 opinion.getExpressionSpan().getTargets().get(opinion.getExpressionSpan().getTargets().size() - 1).getLength());
135
136 String holder = null;
137 if (opinion.getOpinionHolder() != null && !opinion.getOpinionHolder().getTerms().isEmpty()) {
138 holder = opinion.getHolderSpan().getStr();
139 indexes.put("holder-start", opinion.getHolderSpan().getTargets().get(0).getOffset());
140 indexes.put("holder-end", opinion.getHolderSpan().getTargets().get(opinion.getHolderSpan().getTargets().size() - 1).getOffset() +
141 opinion.getHolderSpan().getTargets().get(opinion.getHolderSpan().getTargets().size() - 1).getLength());
142 }
143 else {
144 holder = "null";
145 }
146
147 String target = null;
148 if (opinion.getOpinionTarget() != null && !opinion.getOpinionTarget().getTerms().isEmpty()) {
149 target = opinion.getTargetSpan().getStr();
150 indexes.put("target-start", opinion.getTargetSpan().getTargets().get(0).getOffset());
151 indexes.put("target-end", opinion.getTargetSpan().getTargets().get(opinion.getTargetSpan().getTargets().size() - 1).getOffset() +
152 opinion.getTargetSpan().getTargets().get(opinion.getTargetSpan().getTargets().size() - 1).getLength());
153 }
154 else {
155 target = "null";
156 }
157
158 Element frameElement = doc.createElement("frame");
159
160 Element holderElement = doc.createElement("holder");
161 holderElement.setAttribute("value", holder);
162 holderElement.setAttribute("start", Integer.toString(indexes.get("holder-start")));
163 holderElement.setAttribute("end", Integer.toString(indexes.get("holder-end")));
164
165 Element topicElement = doc.createElement("topic");
166 topicElement.setAttribute("value", target);
167 topicElement.setAttribute("start", Integer.toString(indexes.get("target-start")));
168 topicElement.setAttribute("end", Integer.toString(indexes.get("target-end")));
169
170 Element opinionElement = doc.createElement("opinion");
171 opinionElement.setAttribute("value", expression);
172 opinionElement.setAttribute("start", Integer.toString(indexes.get("expression-start")));
173 opinionElement.setAttribute("end", Integer.toString(indexes.get("expression-end")));
174 Element polarityElement = doc.createElement("polarity");
175 polarityElement.appendChild(doc.createTextNode(opinion.getPolarity() != null ? normalizePolarity(opinion.getPolarity()) : "neutral"));
176 opinionElement.appendChild(polarityElement);
177
178 frameElement.appendChild(holderElement);
179 frameElement.appendChild(topicElement);
180 frameElement.appendChild(opinionElement);
181 sentenceElement.appendChild(frameElement);
182 }
183 }
184
185 LOGGER.info("Read {} files", fileNum);
186
187 TransformerFactory transformerFactory = TransformerFactory.newInstance();
188 Transformer transformer = transformerFactory.newTransformer();
189 transformer.setOutputProperty(OutputKeys.INDENT, "yes");
190 transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
191 transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
192
193 DOMSource source = new DOMSource(doc);
194
195 StreamResult result = new StreamResult(outputFile);
196
197
198 transformer.transform(source, result);
199
200 } catch (final Throwable ex) {
201 CommandLine.fail(ex);
202 }
203 }
204
205 private static String normalizePolarity( String polarity) {
206 String p = polarity.toLowerCase();
207 if (p.contains("pos")) {
208 return "positive";
209 } else if (p.contains("neg")) {
210 return "negative";
211 } else {
212 return "neutral";
213 }
214 }
215
216 }