1 package eu.fbk.dkm.pikes.resources.ontonotes;
2
3 import com.google.common.io.Files;
4 import eu.fbk.utils.core.CommandLine;
5 import eu.fbk.utils.core.FrequencyHashSet;
6 import org.apache.commons.io.Charsets;
7 import org.slf4j.LoggerFactory;
8 import org.xml.sax.Attributes;
9 import org.xml.sax.InputSource;
10 import org.xml.sax.SAXException;
11 import org.xml.sax.helpers.DefaultHandler;
12
13 import javax.xml.XMLConstants;
14 import javax.xml.parsers.ParserConfigurationException;
15 import javax.xml.parsers.SAXParser;
16 import javax.xml.parsers.SAXParserFactory;
17 import javax.xml.xpath.XPathExpressionException;
18 import java.io.*;
19 import java.nio.charset.Charset;
20 import java.util.HashMap;
21 import java.util.HashSet;
22 import java.util.List;
23
24
25
26
27
28 public class VerbNetStatisticsExtractor {
29
30 File ontonotesDir = null, senseDir = null;
31 FrequencyHashSet<String> vnTotals = new FrequencyHashSet<>();
32 FrequencyHashSet<String> fnTotals = new FrequencyHashSet<>();
33
34 public VerbNetStatisticsExtractor() {
35
36 }
37
38 public static void main(String[] args) {
39
40 try {
41 final CommandLine cmd = CommandLine
42 .parser()
43 .withName("VerbNetStatisticsExtractor")
44 .withHeader("Extracts statistics from OntoNotes on frequency of VerbNet/FrameNet")
45 .withOption("n", "ontonotes", "OntoNotes folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
46 .withOption("o", "output", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
47 .withLogger(LoggerFactory.getLogger("eu.fbk.nafview")).parse(args);
48
49 final File dir = cmd.getOptionValue("n", File.class);
50 final File output = cmd.getOptionValue("o", File.class);
51
52 VerbNetStatisticsExtractor statisticsExtractor = new VerbNetStatisticsExtractor();
53 statisticsExtractor.loadDir(dir.getAbsolutePath());
54 try {
55 statisticsExtractor.loadFrequencies();
56 } catch (Exception e) {
57 e.printStackTrace();
58 }
59
60 BufferedWriter writer = new BufferedWriter(new FileWriter(output));
61 for (String key : statisticsExtractor.getVnTotals().keySet()) {
62 writer.append("VN").append("\t").append(key).append("\t").append(statisticsExtractor.getVnTotals().get(key).toString()).append("\n");
63 }
64 for (String key : statisticsExtractor.getFnTotals().keySet()) {
65 writer.append("FN").append("\t").append(key).append("\t").append(statisticsExtractor.getFnTotals().get(key).toString()).append("\n");
66 }
67 writer.close();
68
69 } catch (final Throwable ex) {
70 CommandLine.fail(ex);
71 }
72 }
73
74 public void loadDir(String onDir) {
75 ontonotesDir = new File(onDir + "/data/files/data/english/annotations/");
76 senseDir = new File(onDir + "/data/files/data/english/metadata/sense-inventories/");
77 }
78
79 public FrequencyHashSet<String> getVnTotals() {
80 return vnTotals;
81 }
82
83 public FrequencyHashSet<String> getFnTotals() {
84 return fnTotals;
85 }
86
87 public void loadFrequencies(String fileName) throws IOException {
88 vnTotals = new FrequencyHashSet<>();
89 fnTotals = new FrequencyHashSet<>();
90
91 List<String> lines = Files.readLines(new File(fileName), Charset.defaultCharset());
92 for (String line : lines) {
93 line = line.trim();
94 if (line.length() == 0) {
95 continue;
96 }
97 if (line.startsWith("#")) {
98 continue;
99 }
100
101 String[] parts = line.split("\\s+");
102 if (parts.length < 3) {
103 continue;
104 }
105
106 if (parts[0].equals("FN")) {
107 fnTotals.add(parts[1], Integer.parseInt(parts[2]));
108 }
109 if (parts[0].equals("VN")) {
110 vnTotals.add(parts[1], Integer.parseInt(parts[2]));
111 }
112 }
113
114 }
115
116 public void loadFrequencies() throws IOException, XPathExpressionException, ParserConfigurationException, SAXException {
117
118 if (ontonotesDir == null || senseDir == null) {
119 return;
120 }
121
122 HashMap<String, HashSet<String>> vnMappings = new HashMap<>();
123 HashMap<String, HashSet<String>> fnMappings = new HashMap<>();
124 vnTotals = new FrequencyHashSet<>();
125 fnTotals = new FrequencyHashSet<>();
126
127 DefaultHandler handler = new DefaultHandler() {
128
129 String senseID = null;
130 String lemma = null;
131 boolean inVn = false;
132 boolean inFn = false;
133
134 public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
135 if (qName.equals("sense")) {
136 senseID = attributes.getValue("n");
137 }
138 if (qName.equals("inventory")) {
139 lemma = attributes.getValue("lemma");
140 }
141 if (qName.equals("vn")) {
142 inVn = true;
143 }
144 if (qName.equals("fn")) {
145 inFn = true;
146 }
147 }
148
149 public void endElement(String uri, String localName, String qName) throws SAXException {
150 if (qName.equals("vn")) {
151 inVn = false;
152 }
153 if (qName.equals("fn")) {
154 inFn = false;
155 }
156 }
157
158 public void characters(char ch[], int start, int length) throws SAXException {
159 if (inVn || inFn) {
160 String value = new String(ch, start, length);
161
162 if (value.trim().length() > 0 &&
163 !value.equals("NM") &&
164 !value.equals("NP")) {
165
166 String key = lemma + "-" + senseID;
167
168 String[] parts = value.split("[,\\s]+");
169 for (String part : parts) {
170 part = part.trim().toLowerCase();
171
172 if (inVn) {
173 if (!vnMappings.containsKey(key)) {
174 vnMappings.put(key, new HashSet<>());
175 }
176 vnMappings.get(key).add(part);
177 }
178 if (inFn) {
179 if (!fnMappings.containsKey(key)) {
180 fnMappings.put(key, new HashSet<>());
181 }
182 fnMappings.get(key).add(part);
183 }
184 }
185 }
186 }
187 }
188
189 };
190
191 SAXParserFactory spf = SAXParserFactory.newInstance();
192
193 spf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
194 spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
195 spf.setFeature("http://xml.org/sax/features/validation", false);
196
197 for (File f : Files.fileTreeTraverser().preOrderTraversal(senseDir)) {
198 if (f.isDirectory()) {
199 continue;
200 }
201 if (!f.getAbsolutePath().endsWith(".xml")) {
202 continue;
203 }
204
205 String xml = Files.toString(f, Charset.defaultCharset());
206
207 SAXParser saxParser = spf.newSAXParser();
208
209 InputSource is = new InputSource(new StringReader(xml));
210
211 try {
212 saxParser.parse(is, handler);
213 } catch (Exception e) {
214 System.err.println("Error in file " + f);
215 }
216 }
217
218 for (File f : Files.fileTreeTraverser().preOrderTraversal(ontonotesDir)) {
219
220 if (f.isDirectory()) {
221 continue;
222 }
223 if (!f.getAbsolutePath().endsWith(".sense")) {
224 continue;
225 }
226
227 List<String> lines = Files.readLines(f, Charsets.UTF_8);
228 for (String line : lines) {
229 line = line.trim();
230 if (line.length() == 0) {
231 continue;
232 }
233
234 String[] parts = line.split("\\s+");
235 String lemma = parts[3];
236 String sense = parts[parts.length - 1];
237
238 String key = lemma + "-" + sense;
239 if (vnMappings.get(key) != null) {
240 for (String vn : vnMappings.get(key)) {
241 vnTotals.add(vn);
242 }
243 }
244 if (fnMappings.get(key) != null) {
245 for (String fn : fnMappings.get(key)) {
246 fnTotals.add(fn);
247 }
248 }
249 }
250 }
251 }
252
253 }