1   package eu.fbk.dkm.pikes.resources.goodbadfor;
2   
3   import eu.fbk.dkm.pikes.resources.mpqa.Record;
4   import eu.fbk.dkm.pikes.resources.mpqa.RecordSet;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.KAFDocument;
7   import ixa.kaflib.Opinion;
8   import ixa.kaflib.Term;
9   import org.apache.commons.io.FileUtils;
10  import org.apache.commons.io.FilenameUtils;
11  import org.apache.commons.lang.StringEscapeUtils;
12  import org.slf4j.Logger;
13  import org.slf4j.LoggerFactory;
14  
15  import javax.xml.stream.XMLStreamException;
16  import java.io.File;
17  import java.io.IOException;
18  import java.util.ArrayList;
19  import java.util.HashMap;
20  import java.util.Iterator;
21  import java.util.List;
22  
23  /**
24   * Created by alessio on 24/03/15.
25   */
26  
27  public class CorpusAnnotator {
28  
29  	private static final Logger LOGGER = LoggerFactory.getLogger(CorpusAnnotator.class);
30  
31  	public static void main(final String[] args) throws IOException, XMLStreamException {
32  		try {
33  			final CommandLine cmd = CommandLine
34  					.parser()
35  					.withName("eu.fbk.dkm.pikes.resources.goodbadfor-annotator")
36  					.withHeader("Annotated files with goodFor/badFor annotations")
37  					.withOption("i", "input-path", "the base path of the corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
38  					.withOption("t", "test", "test only on this file", "FILE", CommandLine.Type.STRING, true, false, false)
39  					.withOption("f", "force", "Force opinion")
40  					.withOption("s", "skip", "Skip writing files and show them")
41  					.withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
42  
43  			final File inputPath = cmd.getOptionValue("i", File.class);
44  
45  			File annotationsFolder = new File(inputPath.getAbsolutePath() + File.separator + "MPQA" + File.separator);
46  			File nafFolder = new File(inputPath.getAbsolutePath() + File.separator + "NAF-parsed" + File.separator);
47  //			File documentsFolder = new File(inputPath.getAbsolutePath() + File.separator + "GATE" + File.separator);
48  
49  			boolean forceOpinion = false;
50  			if (cmd.hasOption("force")) {
51  				forceOpinion = true;
52  			}
53  
54  			boolean skip = false;
55  			if (cmd.hasOption("skip")) {
56  				skip = true;
57  			}
58  
59  			String testFile = cmd.getOptionValue("t", String.class);
60  
61  			if (!annotationsFolder.exists()) {
62  				LOGGER.error("Folder {} does not exist", annotationsFolder.getAbsolutePath());
63  			}
64  
65  			if (!nafFolder.exists()) {
66  				LOGGER.error("Folder {} does not exist", nafFolder.getAbsolutePath());
67  			}
68  
69  //			if (!documentsFolder.exists()) {
70  //				LOGGER.error("Folder {} does not exist", documentsFolder.getAbsolutePath());
71  //			}
72  
73  			Iterator<File> fileIterator;
74  			fileIterator = FileUtils.iterateFiles(nafFolder, new String[]{"naf"}, false);
75  			while (fileIterator.hasNext()) {
76  				File file = fileIterator.next();
77  				String fileBaseName = FilenameUtils.removeExtension(file.getName());
78  
79  				if (testFile != null && !testFile.equals(fileBaseName)) {
80  					continue;
81  				}
82  
83  				File mpqaFile = new File(annotationsFolder.getAbsolutePath() + File.separator + fileBaseName + ".eu.fbk.dkm.pikes.resources.mpqa");
84  //				File gateFile = new File(documentsFolder.getAbsolutePath() + File.separator + fileBaseName + ".xml");
85  
86  //				String xmlText = CorpusLoader.getTextFromGateFile(gateFile);
87  
88  				LOGGER.info(String.format("Loading file %s", mpqaFile));
89  				if (!mpqaFile.exists()) {
90  					LOGGER.warn("File {} does not exist", mpqaFile.getAbsolutePath());
91  					continue;
92  				}
93  
94  				String text = "";
95  				LOGGER.info(String.format("Loading file %s", file));
96  				KAFDocument document = KAFDocument.createFromFile(file);
97  				text = document.getRawText();
98  				text = StringEscapeUtils.unescapeHtml(text);
99  				List<Term> terms = document.getTerms();
100 
101 				// Check if there are already opinions
102 				List<Opinion> opinions = document.getOpinions();
103 				if (opinions.size() > 0 && !forceOpinion) {
104 					LOGGER.info("Opinions already present, skipping...");
105 					continue;
106 				}
107 
108 				final RecordSet annotations = RecordSet.readFromFile(mpqaFile);
109 
110 				HashMap<String, Record> index = new HashMap<>();
111 
112 				for (Record record : annotations.getRecords()) {
113 
114 					String span1 = record.getSpan().apply(text);
115 					String span2 = record.getValue("span");
116 
117 					if (span1 == null || span2 == null) {
118 						continue;
119 					}
120 
121 					span1 = StringEscapeUtils.unescapeHtml(span1);
122 					span2 = StringEscapeUtils.unescapeHtml(span2);
123 
124 					String span1OnlyLetters = span1.replaceAll("[^0-9a-zA-Z]", "");
125 					String span2OnlyLetters = span2.replaceAll("[^0-9a-zA-Z]", "");
126 
127 					if (!span1OnlyLetters.equals(span2OnlyLetters)) {
128 						LOGGER.trace(span1);
129 						LOGGER.trace(span2);
130 						LOGGER.warn("The span is different, skipping");
131 						continue;
132 					}
133 
134 					String id = record.getValue("id");
135 					if (id == null) {
136 						LOGGER.warn("ID is null");
137 						continue;
138 					}
139 
140 //					if (index.containsKey(id)) {
141 //						LOGGER.warn("ID {} already exist", id);
142 //						continue;
143 //					}
144 
145 					index.put(id, record);
146 				}
147 
148 				for (Record record : annotations.getRecords()) {
149 					String type = record.getName();
150 					if (type == null) {
151 						continue;
152 					}
153 					if (type.equals("gfbf") || type.equals("influencer")) {
154 
155 						String label = "gold-" + type;
156 						String attribute = "polarity";
157 						if (type.equals("influencer")) {
158 							attribute = "effect";
159 						}
160 
161 						LOGGER.debug(record.toString());
162 
163 						try {
164 							Record agent = index.get(record.getValue("agent"));
165 							Record target = index.get(record.getValue("object"));
166 
167 							List<Term> attitudeSpan = new ArrayList<>();
168 							List<Term> targetSpan = new ArrayList<>();
169 							List<Term> sourceSpan = new ArrayList<>();
170 
171 							attitudeSpan.addAll(eu.fbk.dkm.pikes.resources.mpqa.CorpusAnnotator.getSpan(terms, record.getSpan()));
172 
173 							Opinion opinion = document.newOpinion();
174 							opinion.setLabel(label);
175 
176 							if (agent != null) {
177 								sourceSpan.addAll(eu.fbk.dkm.pikes.resources.mpqa.CorpusAnnotator.getSpan(terms, agent.getSpan()));
178 								if (sourceSpan.size() > 0) {
179 									Opinion.OpinionHolder opinionHolder = opinion.createOpinionHolder(KAFDocument.newTermSpan(sourceSpan));
180 									String attitude = agent.getValue("writerAttitude");
181 									if (attitude != null) {
182 										opinionHolder.setType(attitude);
183 									}
184 								}
185 							}
186 
187 							if (target != null) {
188 								targetSpan.addAll(eu.fbk.dkm.pikes.resources.mpqa.CorpusAnnotator.getSpan(terms, target.getSpan()));
189 								if (targetSpan.size() > 0) {
190 									Opinion.OpinionTarget opinionTarget = opinion.createOpinionTarget(KAFDocument.newTermSpan(targetSpan));
191 									String attitude = target.getValue("writerAttitude");
192 									if (attitude != null) {
193 										opinionTarget.setType(attitude);
194 									}
195 								}
196 							}
197 
198 
199 							if (attitudeSpan.size() > 0) {
200 								opinion.createOpinionExpression(KAFDocument.newTermSpan(attitudeSpan));
201 								opinion.getOpinionExpression().setPolarity(record.getValue(attribute));
202 							}
203 
204 						} catch (Exception e) {
205 							LOGGER.warn(e.getMessage());
206 							e.printStackTrace();
207 						}
208 
209 
210 					}
211 				}
212 
213 				if (skip) {
214 					System.out.println(document);
215 				}
216 				else {
217 					document.save(file.getAbsolutePath());
218 				}
219 			}
220 
221 		} catch (final Throwable ex) {
222 			CommandLine.fail(ex);
223 		}
224 	}
225 
226 
227 }