1   package eu.fbk.dkm.pikes.resources.ontonotes;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.utils.core.CommandLine;
5   import eu.fbk.utils.core.FrequencyHashSet;
6   import org.apache.commons.io.Charsets;
7   import org.slf4j.LoggerFactory;
8   import org.xml.sax.Attributes;
9   import org.xml.sax.InputSource;
10  import org.xml.sax.SAXException;
11  import org.xml.sax.helpers.DefaultHandler;
12  
13  import javax.xml.XMLConstants;
14  import javax.xml.parsers.ParserConfigurationException;
15  import javax.xml.parsers.SAXParser;
16  import javax.xml.parsers.SAXParserFactory;
17  import javax.xml.xpath.XPathExpressionException;
18  import java.io.*;
19  import java.nio.charset.Charset;
20  import java.util.HashMap;
21  import java.util.HashSet;
22  import java.util.List;
23  
24  /**
25   * Created by alessio on 20/08/15.
26   */
27  
28  public class VerbNetStatisticsExtractor {
29  
30  	File ontonotesDir = null, senseDir = null;
31  	FrequencyHashSet<String> vnTotals = new FrequencyHashSet<>();
32  	FrequencyHashSet<String> fnTotals = new FrequencyHashSet<>();
33  
34  	public VerbNetStatisticsExtractor() {
35  
36  	}
37  
38  	public static void main(String[] args) {
39  
40  		try {
41  			final CommandLine cmd = CommandLine
42  					.parser()
43  					.withName("VerbNetStatisticsExtractor")
44  					.withHeader("Extracts statistics from OntoNotes on frequency of VerbNet/FrameNet")
45  					.withOption("n", "ontonotes", "OntoNotes folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
46  					.withOption("o", "output", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
47  					.withLogger(LoggerFactory.getLogger("eu.fbk.nafview")).parse(args);
48  
49  			final File dir = cmd.getOptionValue("n", File.class);
50  			final File output = cmd.getOptionValue("o", File.class);
51  
52  			VerbNetStatisticsExtractor statisticsExtractor = new VerbNetStatisticsExtractor();
53  			statisticsExtractor.loadDir(dir.getAbsolutePath());
54  			try {
55  				statisticsExtractor.loadFrequencies();
56  			} catch (Exception e) {
57  				e.printStackTrace();
58  			}
59  
60  			BufferedWriter writer = new BufferedWriter(new FileWriter(output));
61  			for (String key : statisticsExtractor.getVnTotals().keySet()) {
62  				writer.append("VN").append("\t").append(key).append("\t").append(statisticsExtractor.getVnTotals().get(key).toString()).append("\n");
63  			}
64  			for (String key : statisticsExtractor.getFnTotals().keySet()) {
65  				writer.append("FN").append("\t").append(key).append("\t").append(statisticsExtractor.getFnTotals().get(key).toString()).append("\n");
66  			}
67  			writer.close();
68  
69  		} catch (final Throwable ex) {
70  			CommandLine.fail(ex);
71  		}
72  	}
73  
74  	public void loadDir(String onDir) {
75  		ontonotesDir = new File(onDir + "/data/files/data/english/annotations/");
76  		senseDir = new File(onDir + "/data/files/data/english/metadata/sense-inventories/");
77  	}
78  
79  	public FrequencyHashSet<String> getVnTotals() {
80  		return vnTotals;
81  	}
82  
83  	public FrequencyHashSet<String> getFnTotals() {
84  		return fnTotals;
85  	}
86  
87  	public void loadFrequencies(String fileName) throws IOException {
88  		vnTotals = new FrequencyHashSet<>();
89  		fnTotals = new FrequencyHashSet<>();
90  
91  		List<String> lines = Files.readLines(new File(fileName), Charset.defaultCharset());
92  		for (String line : lines) {
93  			line = line.trim();
94  			if (line.length() == 0) {
95  				continue;
96  			}
97  			if (line.startsWith("#")) {
98  				continue;
99  			}
100 
101 			String[] parts = line.split("\\s+");
102 			if (parts.length < 3) {
103 				continue;
104 			}
105 
106 			if (parts[0].equals("FN")) {
107 				fnTotals.add(parts[1], Integer.parseInt(parts[2]));
108 			}
109 			if (parts[0].equals("VN")) {
110 				vnTotals.add(parts[1], Integer.parseInt(parts[2]));
111 			}
112 		}
113 
114 	}
115 
116 	public void loadFrequencies() throws IOException, XPathExpressionException, ParserConfigurationException, SAXException {
117 
118 		if (ontonotesDir == null || senseDir == null) {
119 			return;
120 		}
121 
122 		HashMap<String, HashSet<String>> vnMappings = new HashMap<>();
123 		HashMap<String, HashSet<String>> fnMappings = new HashMap<>();
124 		vnTotals = new FrequencyHashSet<>();
125 		fnTotals = new FrequencyHashSet<>();
126 
127 		DefaultHandler handler = new DefaultHandler() {
128 
129 			String senseID = null;
130 			String lemma = null;
131 			boolean inVn = false;
132 			boolean inFn = false;
133 
134 			public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
135 				if (qName.equals("sense")) {
136 					senseID = attributes.getValue("n");
137 				}
138 				if (qName.equals("inventory")) {
139 					lemma = attributes.getValue("lemma");
140 				}
141 				if (qName.equals("vn")) {
142 					inVn = true;
143 				}
144 				if (qName.equals("fn")) {
145 					inFn = true;
146 				}
147 			}
148 
149 			public void endElement(String uri, String localName, String qName) throws SAXException {
150 				if (qName.equals("vn")) {
151 					inVn = false;
152 				}
153 				if (qName.equals("fn")) {
154 					inFn = false;
155 				}
156 			}
157 
158 			public void characters(char ch[], int start, int length) throws SAXException {
159 				if (inVn || inFn) {
160 					String value = new String(ch, start, length);
161 
162 					if (value.trim().length() > 0 &&
163 							!value.equals("NM") &&
164 							!value.equals("NP")) {
165 
166 						String key = lemma + "-" + senseID;
167 
168 						String[] parts = value.split("[,\\s]+");
169 						for (String part : parts) {
170 							part = part.trim().toLowerCase();
171 
172 							if (inVn) {
173 								if (!vnMappings.containsKey(key)) {
174 									vnMappings.put(key, new HashSet<>());
175 								}
176 								vnMappings.get(key).add(part);
177 							}
178 							if (inFn) {
179 								if (!fnMappings.containsKey(key)) {
180 									fnMappings.put(key, new HashSet<>());
181 								}
182 								fnMappings.get(key).add(part);
183 							}
184 						}
185 					}
186 				}
187 			}
188 
189 		};
190 
191 		SAXParserFactory spf = SAXParserFactory.newInstance();
192 
193 		spf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
194 		spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
195 		spf.setFeature("http://xml.org/sax/features/validation", false);
196 
197 		for (File f : Files.fileTreeTraverser().preOrderTraversal(senseDir)) {
198 			if (f.isDirectory()) {
199 				continue;
200 			}
201 			if (!f.getAbsolutePath().endsWith(".xml")) {
202 				continue;
203 			}
204 
205 			String xml = Files.toString(f, Charset.defaultCharset());
206 
207 			SAXParser saxParser = spf.newSAXParser();
208 
209 			InputSource is = new InputSource(new StringReader(xml));
210 
211 			try {
212 				saxParser.parse(is, handler);
213 			} catch (Exception e) {
214 				System.err.println("Error in file " + f);
215 			}
216 		}
217 
218 		for (File f : Files.fileTreeTraverser().preOrderTraversal(ontonotesDir)) {
219 
220 			if (f.isDirectory()) {
221 				continue;
222 			}
223 			if (!f.getAbsolutePath().endsWith(".sense")) {
224 				continue;
225 			}
226 
227 			List<String> lines = Files.readLines(f, Charsets.UTF_8);
228 			for (String line : lines) {
229 				line = line.trim();
230 				if (line.length() == 0) {
231 					continue;
232 				}
233 
234 				String[] parts = line.split("\\s+");
235 				String lemma = parts[3];
236 				String sense = parts[parts.length - 1];
237 
238 				String key = lemma + "-" + sense;
239 				if (vnMappings.get(key) != null) {
240 					for (String vn : vnMappings.get(key)) {
241 						vnTotals.add(vn);
242 					}
243 				}
244 				if (fnMappings.get(key) != null) {
245 					for (String fn : fnMappings.get(key)) {
246 						fnTotals.add(fn);
247 					}
248 				}
249 			}
250 		}
251 	}
252 
253 }