1   package eu.fbk.dkm.pikes.resources.darmstadt;
2   
3   import java.io.File;
4   import java.nio.file.Files;
5   import java.util.ArrayList;
6   import java.util.HashMap;
7   import java.util.List;
8   import java.util.Set;
9   
10  import javax.xml.parsers.DocumentBuilder;
11  import javax.xml.parsers.DocumentBuilderFactory;
12  import javax.xml.transform.OutputKeys;
13  import javax.xml.transform.Transformer;
14  import javax.xml.transform.TransformerFactory;
15  import javax.xml.transform.dom.DOMSource;
16  import javax.xml.transform.stream.StreamResult;
17  
18  import com.google.common.collect.Sets;
19  
20  import eu.fbk.rdfpro.util.Statements;
21  import org.eclipse.rdf4j.model.IRI;
22  import org.eclipse.rdf4j.model.impl.URIImpl;
23  import org.slf4j.LoggerFactory;
24  import org.w3c.dom.Document;
25  import org.w3c.dom.Element;
26  
27  import ixa.kaflib.KAFDocument;
28  import ixa.kaflib.Opinion;
29  
30  import eu.fbk.dkm.pikes.naflib.Corpus;
31  import eu.fbk.utils.core.CommandLine;
32  
33  /**
34   * Created by alessio on 26/05/15.
35   */
36  
37  public class ConvertNafDocumentsToXml {
38  
39  	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(ConvertNafDocumentsToXml.class);
40  
41  	public static void main(String[] args) {
42  		try {
43  			final CommandLine cmd = CommandLine
44  					.parser()
45  					.withName("yamcha-extractor")
46  					.withHeader("Check ESWC dataset with Darmstadt")
47  					.withOption("i", "input-folder", "the folder of the NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
48  					.withOption("o", "output-file", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
49  					.withOption("l", "label", "opinion label", "STRING", CommandLine.Type.STRING, true, false, true)
50  					.withOption("n", "numeric", "use numeric values for IDs")
51  					.withOption(null, "list", "use list of file to sort", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
52  					.withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
53  
54  			File inputFolder = cmd.getOptionValue("input-folder", File.class);
55  			File outputFile = cmd.getOptionValue("output-file", File.class);
56  			Set<String> labels = Sets.newHashSet(cmd.getOptionValue("label", String.class, "").split(","));
57  			
58  			File list = cmd.getOptionValue("list", File.class);
59  
60  			boolean useNumeric = cmd.hasOption("numeric");
61  
62  			DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
63  			DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
64  
65  			Document doc = docBuilder.newDocument();
66  			Element rootElement = doc.createElement("Sentences");
67  			doc.appendChild(rootElement);
68  
69  			int id = 0;
70  
71  			Iterable<KAFDocument> corpus = Corpus.create(false, inputFolder);
72  
73  			if (list != null) {
74  				LOGGER.info("Load file list from {}", list.getAbsolutePath());
75  				ArrayList<KAFDocument> files = new ArrayList<>();
76  				List<String> fileList = Files.readAllLines(list.toPath());
77  				for (String fileName : fileList) {
78  					fileName = fileName.trim();
79  					if (fileName.length() == 0) {
80  						continue;
81  					}
82  					String documentFileName = inputFolder + File.separator + fileName;
83  					files.add(KAFDocument.createFromFile(new File(documentFileName)));
84  				}
85  				corpus = files;
86  			}
87  
88  			int fileNum = 0;
89  			for (KAFDocument document : corpus) {
90  				fileNum++;
91  				LOGGER.info("File {}", document.getPublic().uri);
92  				Element sentenceElement = doc.createElement("sentence");
93  
94  				if (useNumeric) {
95  					sentenceElement.setAttribute("id", "" + id++);
96  				}
97  				else {
98  					IRI uri = Statements.VALUE_FACTORY.createIRI(document.getPublic().uri);
99  					sentenceElement.setAttribute("id", uri.getLocalName());
100 				}
101 
102 				rootElement.appendChild(sentenceElement);
103 				Element textElement = doc.createElement("text");
104 				textElement.appendChild(doc.createTextNode(document.getRawText()));
105 				sentenceElement.appendChild(textElement);
106 
107 				for (Opinion opinion : document.getOpinions()) {
108 
109 				    boolean matches = false;
110 				    for (String l : labels) {
111 				        if (opinion.getLabel().contains(l)) {
112 				            matches = true;
113 				            break;
114 				        }
115 				    }
116 				    if (!matches) {
117 				        continue;
118 				    }
119 				    
120 					String expression = null;
121 					if (opinion.getOpinionExpression() == null) {
122 						continue;
123 					}
124 
125 					HashMap<String, Integer> indexes = new HashMap<>();
126 					indexes.put("holder-start", -1);
127 					indexes.put("holder-end", -1);
128 					indexes.put("target-start", -1);
129 					indexes.put("target-end", -1);
130 
131 					expression = opinion.getExpressionSpan().getStr();
132 					indexes.put("expression-start", opinion.getExpressionSpan().getTargets().get(0).getOffset());
133 					indexes.put("expression-end", opinion.getExpressionSpan().getTargets().get(opinion.getExpressionSpan().getTargets().size() - 1).getOffset() +
134 							opinion.getExpressionSpan().getTargets().get(opinion.getExpressionSpan().getTargets().size() - 1).getLength());
135 
136 					String holder = null;
137 					if (opinion.getOpinionHolder() != null && !opinion.getOpinionHolder().getTerms().isEmpty()) {
138 						holder = opinion.getHolderSpan().getStr();
139 						indexes.put("holder-start", opinion.getHolderSpan().getTargets().get(0).getOffset());
140 						indexes.put("holder-end", opinion.getHolderSpan().getTargets().get(opinion.getHolderSpan().getTargets().size() - 1).getOffset() +
141 								opinion.getHolderSpan().getTargets().get(opinion.getHolderSpan().getTargets().size() - 1).getLength());
142 					}
143 					else {
144 						holder = "null";
145 					}
146 
147 					String target = null;
148 					if (opinion.getOpinionTarget() != null && !opinion.getOpinionTarget().getTerms().isEmpty()) {
149 						target = opinion.getTargetSpan().getStr();
150 						indexes.put("target-start", opinion.getTargetSpan().getTargets().get(0).getOffset());
151 						indexes.put("target-end", opinion.getTargetSpan().getTargets().get(opinion.getTargetSpan().getTargets().size() - 1).getOffset() +
152 								opinion.getTargetSpan().getTargets().get(opinion.getTargetSpan().getTargets().size() - 1).getLength());
153 					}
154 					else {
155 						target = "null";
156 					}
157 
158 					Element frameElement = doc.createElement("frame");
159 
160 					Element holderElement = doc.createElement("holder");
161 					holderElement.setAttribute("value", holder);
162 					holderElement.setAttribute("start", Integer.toString(indexes.get("holder-start")));
163 					holderElement.setAttribute("end", Integer.toString(indexes.get("holder-end")));
164 
165 					Element topicElement = doc.createElement("topic");
166 					topicElement.setAttribute("value", target);
167 					topicElement.setAttribute("start", Integer.toString(indexes.get("target-start")));
168 					topicElement.setAttribute("end", Integer.toString(indexes.get("target-end")));
169 
170 					Element opinionElement = doc.createElement("opinion");
171 					opinionElement.setAttribute("value", expression);
172 					opinionElement.setAttribute("start", Integer.toString(indexes.get("expression-start")));
173 					opinionElement.setAttribute("end", Integer.toString(indexes.get("expression-end")));
174 					Element polarityElement = doc.createElement("polarity");
175 					polarityElement.appendChild(doc.createTextNode(opinion.getPolarity() != null ? normalizePolarity(opinion.getPolarity()) : "neutral"));
176 					opinionElement.appendChild(polarityElement);
177 
178 					frameElement.appendChild(holderElement);
179 					frameElement.appendChild(topicElement);
180 					frameElement.appendChild(opinionElement);
181 					sentenceElement.appendChild(frameElement);
182 				}
183 			}
184 
185 			LOGGER.info("Read {} files", fileNum);
186 
187 			TransformerFactory transformerFactory = TransformerFactory.newInstance();
188 			Transformer transformer = transformerFactory.newTransformer();
189 			transformer.setOutputProperty(OutputKeys.INDENT, "yes");
190 			transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
191 			transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
192 
193 			DOMSource source = new DOMSource(doc);
194 
195 			StreamResult result = new StreamResult(outputFile);
196 //			StreamResult result = new StreamResult(System.out);
197 
198 			transformer.transform(source, result);
199 
200 		} catch (final Throwable ex) {
201 			CommandLine.fail(ex);
202 		}
203 	}
204 	
205 	private static String normalizePolarity( String polarity) {
206         String p = polarity.toLowerCase();
207         if (p.contains("pos")) {
208             return "positive";
209         } else if (p.contains("neg")) {
210             return "negative";
211         } else {
212             return "neutral";
213         }
214     }
215 	
216 }