1   package eu.fbk.dkm.pikes.resources.darmstadt;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Files;
5   import eu.fbk.dkm.pikes.resources.NAFFilter;
6   import eu.fbk.utils.core.CommandLine;
7   import ixa.kaflib.KAFDocument;
8   import ixa.kaflib.Opinion;
9   import ixa.kaflib.Span;
10  import ixa.kaflib.Term;
11  import org.apache.commons.io.FileUtils;
12  import org.apache.commons.io.FilenameUtils;
13  import org.slf4j.LoggerFactory;
14  import org.w3c.dom.Document;
15  import org.w3c.dom.Element;
16  import org.w3c.dom.NamedNodeMap;
17  import org.w3c.dom.NodeList;
18  
19  import javax.xml.parsers.DocumentBuilder;
20  import javax.xml.parsers.DocumentBuilderFactory;
21  import java.io.ByteArrayInputStream;
22  import java.io.File;
23  import java.util.HashMap;
24  import java.util.HashSet;
25  import java.util.Iterator;
26  import java.util.regex.Matcher;
27  import java.util.regex.Pattern;
28  
29  /**
30   * Created by alessio on 10/04/15.
31   */
32  
33  public class CorpusAnnotator {
34  
35  	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CorpusAnnotator.class);
36  	private static Pattern spanPattern = Pattern.compile("word_([0-9]+)");
37  	private static String TERM_PREFIX = "t";
38  
39  	private static void getFilesRecursive(File pFile, HashSet<String> folders) {
40  		for (File file : pFile.listFiles()) {
41  			if (file.isDirectory()) {
42  				folders.add(file.getAbsolutePath());
43  				getFilesRecursive(file, folders);
44  			}
45  		}
46  	}
47  
48  	private static Integer getTermFromSpan(String span) {
49  		Matcher matcher = spanPattern.matcher(span);
50  		if (matcher.find()) {
51  			Integer id = Integer.parseInt(matcher.group(1));
52  			return id - 1;
53  		}
54  
55  		return null;
56  	}
57  
58  	private static Span<Term> getTermsFromSpan(KAFDocument document, String span) {
59  		String[] parts = span.split("[^a-z0-9A-Z_]+");
60  		Span<Term> termSpan = KAFDocument.newTermSpan();
61  
62  		if (parts.length == 1) {
63  			Integer id = getTermFromSpan(parts[0]);
64  			termSpan.addTarget(document.getTerms().get(id));
65  		}
66  		else if (parts.length > 1) {
67  			Integer id1 = getTermFromSpan(parts[0]);
68  			Integer id2 = getTermFromSpan(parts[parts.length - 1]);
69  			for (int i = id1; i <= id2; i++) {
70  				termSpan.addTarget(document.getTerms().get(i));
71  			}
72  		}
73  
74  		return termSpan;
75  	}
76  
77  	public static void main(String[] args) {
78  		try {
79  			final CommandLine cmd = CommandLine
80  					.parser()
81  					.withName("eu.fbk.dkm.pikes.resources.darmstadt-loader")
82  					.withHeader("Load eu.fbk.dkm.pikes.resources.darmstadt-service-review-corpus")
83  					.withOption("i", "input-folder", "the folder of the corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
84  					.withOption("f", "force", "Force opinion")
85  					.withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
86  
87  			final File inputFile = cmd.getOptionValue("i", File.class);
88  			boolean forceOpinion = cmd.hasOption("f");
89  
90  			DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
91  			dbFactory.setValidating(false);
92  			dbFactory.setNamespaceAware(true);
93  			dbFactory.setFeature("http://xml.org/sax/features/namespaces", false);
94  			dbFactory.setFeature("http://xml.org/sax/features/validation", false);
95  			dbFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
96  			dbFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
97  			DocumentBuilder dBuilder;
98  			Document doc;
99  
100 			HashSet<String> folders = new HashSet<>();
101 			getFilesRecursive(inputFile, folders);
102 
103 			HashSet<String> okFolders = new HashSet<>();
104 			okLoop:
105 			for (String folder : folders) {
106 				for (String pattern : CorpusLoader.MMAX_PATTERN) {
107 					StringBuffer newFolder = new StringBuffer();
108 					newFolder.append(folder);
109 					newFolder.append(File.separator);
110 					newFolder.append(pattern);
111 
112 					if (!folders.contains(newFolder.toString())) {
113 						continue okLoop;
114 					}
115 				}
116 
117 				okFolders.add(folder);
118 			}
119 
120 			for (String folder : okFolders) {
121 				LOGGER.info("Entering folder {}", folder);
122 
123 				String markableDir = folder + File.separator + CorpusLoader.MMAX_PATTERN[1];
124 				String basedataDir = folder + File.separator + CorpusLoader.MMAX_PATTERN[0];
125 				File nafDir = new File(folder + File.separator + "naf-parsed");
126 
127 				Iterator<File> fileIterator;
128 				fileIterator = FileUtils.iterateFiles(nafDir, new String[]{"naf"}, false);
129 				while (fileIterator.hasNext()) {
130 					File file = fileIterator.next();
131 					String fileBaseName = FilenameUtils.removeExtension(file.getName());
132 					LOGGER.info(fileBaseName);
133 
134 					File annotatedFile = new File(markableDir + File.separator + fileBaseName + CorpusLoader.MMAX_SUFFIXES[1] + ".xml");
135 					if (!annotatedFile.exists()) {
136 						LOGGER.warn("File {} does not exist", annotatedFile.getAbsolutePath());
137 						continue;
138 					}
139 
140 					File basedataFile = new File(basedataDir + File.separator + fileBaseName + CorpusLoader.MMAX_SUFFIXES[0] + ".xml");
141 					if (!basedataFile.exists()) {
142 						LOGGER.warn("File {} does not exist", basedataFile.getAbsolutePath());
143 						continue;
144 					}
145 
146 					KAFDocument document = KAFDocument.createFromFile(file);
147 
148 					boolean hasGoldOpinions = false;
149 					for (Opinion opinion : document.getOpinions()) {
150 					    if ("gold-eu.fbk.dkm.pikes.resources.darmstadt".equals(opinion.getLabel())) {
151 					        hasGoldOpinions = true;
152 					        break;
153 					    }
154 					}
155 					
156 					if (hasGoldOpinions && !forceOpinion) {
157 					    LOGGER.info("Opinions already present, skipping...");
158 					
159 					} else {
160 					    String fileContent;
161     					fileContent = Files.toString(basedataFile, Charsets.UTF_8);
162     					fileContent = fileContent.replaceAll("&", "&amp;");
163     					dBuilder = dbFactory.newDocumentBuilder();
164     					doc = dBuilder.parse(new ByteArrayInputStream(fileContent.getBytes(Charsets.UTF_8)));
165     
166     					int origWordCount = doc.getElementsByTagName("word").getLength();
167     					int nafWordCount = document.getWFs().size();
168     
169     					if (origWordCount != nafWordCount) {
170     						LOGGER.warn("Word counts differ ({}/{})", origWordCount, nafWordCount);
171     					}
172     
173     					HashMap<String, HashMap<String, String>> markables = new HashMap<>();
174     
175     					fileContent = Files.toString(annotatedFile, Charsets.UTF_8);
176     					dBuilder = dbFactory.newDocumentBuilder();
177     					doc = dBuilder.parse(new ByteArrayInputStream(fileContent.getBytes(Charsets.UTF_8)));
178     					NodeList nList = doc.getElementsByTagName("markable");
179     					for (int temp = 0; temp < nList.getLength(); temp++) {
180     						Element nNode = (Element) nList.item(temp);
181     						NamedNodeMap attributes = nNode.getAttributes();
182     						if (attributes != null) {
183     							HashMap<String, String> thisMarkable = new HashMap<>();
184     							for (int i = 0; i < attributes.getLength(); i++) {
185     								thisMarkable.put(attributes.item(i).getNodeName(), attributes.item(i).getNodeValue());
186     							}
187     
188     							if (thisMarkable.get("id") != null) {
189     								markables.put(thisMarkable.get("id"), thisMarkable);
190     							}
191     						}
192     					}
193     
194     					for (HashMap<String, String> markable : markables.values()) {
195     						if (markable.get("annotation_type").equals("opinionexpression")) {
196     
197     							String holderString = markable.get("opinionholder");
198     							String targetString = markable.get("opiniontarget");
199     
200     							HashMap<String, String> holder = null;
201     							HashMap<String, String> target = null;
202     
203     							if (holderString != null && !holderString.equals("empty")) {
204     								holder = markables.get(holderString);
205     							}
206     							if (targetString != null && !targetString.equals("empty")) {
207     								target = markables.get(targetString);
208     							}
209     
210     							Span<Term> termSpan;
211     
212     							try {
213     								termSpan = getTermsFromSpan(document, markable.get("span"));
214     							} catch (Exception e) {
215     								continue;
216     							}
217     
218     							Opinion opinion = document.createOpinion();
219     							opinion.setLabel("gold-eu.fbk.dkm.pikes.resources.darmstadt");
220     							Opinion.OpinionExpression expression = opinion.createOpinionExpression(termSpan);
221     							if (markable.get("polarity") != null) {
222     								expression.setPolarity(markable.get("polarity"));
223     							}
224     							if (markable.get("strength") != null) {
225     								expression.setStrength(markable.get("strength"));
226     							}
227     
228     							if (holder != null) {
229     								Span<Term> terms = getTermsFromSpan(document, holder.get("span"));
230     								opinion.createOpinionHolder(terms);
231     							}
232     							if (target != null) {
233     								Span<Term> terms = getTermsFromSpan(document, target.get("span"));
234     								opinion.createOpinionTarget(terms);
235     							}
236     						}
237     					}
238 					}
239 					
240                     NAFFilter.builder(false).withSRLRoleLinking(true, true)
241                             .withOpinionLinking(true, true).build().filter(document);
242 					
243 					document.save(file.getAbsolutePath());
244 				}
245 			}
246 
247 		} catch (final Throwable ex) {
248 			CommandLine.fail(ex);
249 		}
250 	}
251 
252 }