1 package eu.fbk.dkm.pikes.tintop.util.framenet;
2
3 import ch.qos.logback.classic.Level;
4 import com.google.common.io.Files;
5 import edu.stanford.nlp.ling.CoreAnnotations;
6 import edu.stanford.nlp.ling.CoreLabel;
7 import edu.stanford.nlp.pipeline.Annotation;
8 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
9 import edu.stanford.nlp.semgraph.SemanticGraph;
10 import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
11 import edu.stanford.nlp.util.CoreMap;
12 import eu.fbk.dkm.pikes.depparseannotation.DepParseInfo;
13 import eu.fbk.dkm.pikes.depparseannotation.DepparseAnnotations;
14 import org.joox.JOOX;
15 import org.slf4j.Logger;
16 import org.slf4j.LoggerFactory;
17 import org.w3c.dom.Document;
18 import org.w3c.dom.Element;
19
20 import javax.xml.parsers.DocumentBuilder;
21 import javax.xml.parsers.DocumentBuilderFactory;
22 import java.io.BufferedWriter;
23 import java.io.File;
24 import java.io.FileWriter;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.Properties;
28 import java.util.TreeMap;
29
30
31
32
33
34 public class ParseFullTextWithStanford {
35
36 private static final Logger LOGGER = LoggerFactory.getLogger(ParseFullTextWithStanford.class);
37 private static String annotatorsPattern = "tokenize, ssplit, %s, lemma, %s";
38
39
40
41
42
43 static class SpanInformation {
44
45 String label;
46 int start, end;
47
48 public SpanInformation(String label, int start, int end) {
49 this.label = label;
50 this.start = start;
51 this.end = end;
52 }
53
54 @Override public String toString() {
55 return "SpanInformation{" +
56 "label='" + label + '\'' +
57 ", start=" + start +
58 ", end=" + end +
59 '}';
60 }
61 }
62
63 static class FrameInformation {
64
65 SpanInformation target;
66 List<SpanInformation> roles = new ArrayList<>();
67 String luName, frameName;
68
69 public FrameInformation(String luName, String frameName) {
70 this.luName = luName;
71 this.frameName = frameName;
72 }
73
74 public void setTarget(SpanInformation target) {
75 this.target = target;
76 }
77
78 public void addRole(SpanInformation role) {
79 this.roles.add(role);
80 }
81
82 @Override public String toString() {
83 return "FrameInformation{" +
84 "target=" + target +
85 ", roles=" + roles +
86 ", luName='" + luName + '\'' +
87 ", frameName='" + frameName + '\'' +
88 '}';
89 }
90 }
91
92
93
94
95
96
97
98
99
100
101
102
103 public static void main(String[] args) {
104
105 if (args.length < 5) {
106 LOGGER.error("Five arguments needed: "
107 + "fullTextPath, outputFile (parsing), outputFile (frames), posAnnotator, parseAnnotator");
108 }
109
110 String fullTextPath = args[0];
111 String outputFile1 = args[1];
112 String outputFile2 = args[2];
113 String posAnnotator = args[3];
114 String parseAnnotator = args[4];
115
116 String annotators = String.format(annotatorsPattern, posAnnotator, parseAnnotator);
117
118 try {
119
120 ((ch.qos.logback.classic.Logger) LoggerFactory.getLogger("edu.stanford")).setLevel(Level.ERROR);
121
122 File fullTextFile = new File(fullTextPath);
123 BufferedWriter writerLemmas = new BufferedWriter(new FileWriter(outputFile1));
124 BufferedWriter writerFrames = new BufferedWriter(new FileWriter(outputFile2));
125
126 Properties props = new Properties();
127 props.setProperty("annotators", annotators);
128 props.setProperty("customAnnotatorClass.fake_pos", "eu.fbk.fcw.utils.annotators.FakePosAnnotator");
129 props.setProperty("customAnnotatorClass.mst_server",
130 "eu.fbk.fcw.mst.api.MstServerParserAnnotator");
131
132 props.setProperty("tokenize.whitespace", "true");
133 props.setProperty("ssplit.eolonly", "true");
134
135 props.setProperty("mst_server.host", "localhost");
136 props.setProperty("mst_server.port", "8012");
137
138 StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
139
140 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
141 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
142
143 int sentNo = -1;
144 for (final File file : Files.fileTreeTraverser().preOrderTraversal(fullTextFile)) {
145 if (!file.isFile()) {
146 continue;
147 }
148 if (file.getName().startsWith(".")) {
149 continue;
150 }
151 if (!file.getName().endsWith(".xml")) {
152 continue;
153 }
154
155 LOGGER.info("File: {}", file.getName());
156
157 Document doc = dBuilder.parse(file);
158
159 for (Element sentenceElement : JOOX.$(doc).find("sentence")) {
160 String sentenceID = sentenceElement.getAttribute("ID");
161
162 TreeMap<Integer, String> pos = new TreeMap<>();
163 Element textElement = JOOX.$(sentenceElement).find("text").get(0);
164 String text = textElement.getTextContent();
165 LOGGER.trace(text);
166 StringBuffer stringBuffer;
167
168 List<FrameInformation> frames = new ArrayList<>();
169
170 for (Element annotationSet : JOOX.$(sentenceElement).find("annotationSet")) {
171 String luName = annotationSet.getAttribute("luName");
172 if (luName == null || luName.trim().length() == 0) {
173 for (Element layer : JOOX.$(annotationSet).find("layer")) {
174 if (layer.getAttribute("name").equals("PENN")) {
175 for (Element label : JOOX.$(layer).find("label")) {
176 Integer start = Integer.parseInt(label.getAttribute("start"));
177 String thisPos = label.getAttribute("name");
178 pos.put(start, thisPos);
179 }
180 }
181 }
182 } else {
183 FrameInformation frameInformation = new FrameInformation(
184 annotationSet.getAttribute("luName"), annotationSet.getAttribute("frameName"));
185 for (Element layer : JOOX.$(annotationSet).find("layer")) {
186 if (layer.getAttribute("name").equals("Target")) {
187 for (Element label : JOOX.$(layer).find("label")) {
188 frameInformation.setTarget(new SpanInformation(label.getAttribute("name"),
189 Integer.parseInt(label.getAttribute("start")),
190 Integer.parseInt(label.getAttribute("end"))
191 )
192 );
193 }
194 }
195 if (layer.getAttribute("name").equals("FE")) {
196 for (Element label : JOOX.$(layer).find("label")) {
197 String start = label.getAttribute("start");
198 String end = label.getAttribute("end");
199
200 if (start.length() > 0 && end.length() > 0) {
201 frameInformation.addRole(new SpanInformation(label.getAttribute("name"),
202 Integer.parseInt(start),
203 Integer.parseInt(end)
204 )
205 );
206 }
207 }
208 }
209 }
210
211 frames.add(frameInformation);
212 }
213 }
214
215 stringBuffer = new StringBuffer();
216 for (Integer key : pos.keySet()) {
217 String value = pos.get(key);
218 stringBuffer.append(value).append(" ");
219 }
220
221 Annotation s;
222 props.setProperty("fake_pos.pos", stringBuffer.toString().trim());
223 try {
224 pipeline = new StanfordCoreNLP(props);
225 s = new Annotation(text);
226 pipeline.annotate(s);
227 } catch (Throwable e) {
228 LOGGER.warn("Skipped sentence {}:{}", file.getName(), sentenceID);
229 continue;
230 }
231
232 sentNo++;
233 int size = s.get(CoreAnnotations.TokensAnnotation.class).size();
234
235 String[] tokens = new String[size];
236 String[] poss = new String[size];
237 String[] depLabels = new String[size];
238 String[] depParents = new String[size];
239 String[] lemmas = new String[size];
240
241 TreeMap<Integer, Integer> ids = new TreeMap<>();
242 for (CoreMap sentence : s.get(CoreAnnotations.SentencesAnnotation.class)) {
243
244 DepParseInfo info;
245
246 if (parseAnnotator.equals("parse")) {
247 SemanticGraph dependencies = sentence.get(
248 SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
249 info = new DepParseInfo(dependencies);
250 } else {
251 info = sentence.get(DepparseAnnotations.MstParserAnnotation.class);
252 }
253
254 for (Integer tokenID : info.getDepParents().keySet()) {
255 int parent = info.getDepParents().get(tokenID);
256 depParents[tokenID - 1] = Integer.toString(parent);
257 }
258 for (Integer tokenID : info.getDepLabels().keySet()) {
259 depLabels[tokenID - 1] = info.getDepLabels().get(tokenID);
260 }
261
262 java.util.List<CoreLabel> get = sentence.get(CoreAnnotations.TokensAnnotation.class);
263 for (int i = 0; i < get.size(); i++) {
264 CoreLabel token = get.get(i);
265 tokens[i] = token.get(CoreAnnotations.TextAnnotation.class);
266 poss[i] = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
267 lemmas[i] = token.get(CoreAnnotations.LemmaAnnotation.class);
268
269 ids.put(token.beginPosition(), i);
270 }
271 }
272
273 for (FrameInformation frame : frames) {
274 StringBuffer frameBuffer = new StringBuffer();
275
276 try {
277 int numFrameRoles = 1 + frame.roles.size();
278 frameBuffer.append(numFrameRoles);
279 frameBuffer.append("\t").append(frame.frameName);
280 frameBuffer.append("\t").append(frame.luName);
281
282 String interval = getInterval(frame.target, ids);
283 frameBuffer.append("\t").append(interval);
284
285 StringBuffer partsBuffer = new StringBuffer();
286 String[] parts = interval.split("_+");
287 for (String stringID : parts) {
288 partsBuffer.append("_").append(tokens[Integer.parseInt(stringID)]);
289 }
290 frameBuffer.append("\t").append(partsBuffer.toString().substring(1));
291 frameBuffer.append("\t").append(sentNo);
292 for (SpanInformation role : frame.roles) {
293 frameBuffer.append("\t").append(role.label);
294 frameBuffer.append("\t").append(getInterval(role, ids, true));
295 }
296
297 frameBuffer.append("\n");
298 } catch (Exception e) {
299 LOGGER.warn("Skipped frame: {}", frame.frameName);
300 continue;
301 }
302
303 writerFrames.append(frameBuffer);
304 }
305
306 stringBuffer = new StringBuffer();
307 stringBuffer.append(size);
308 for (String value : tokens) {
309 stringBuffer.append("\t").append(value);
310 }
311 for (String value : poss) {
312 stringBuffer.append("\t").append(value);
313 }
314 for (String value : depLabels) {
315 stringBuffer.append("\t").append(value);
316 }
317 for (String value : depParents) {
318 stringBuffer.append("\t").append(value);
319 }
320 for (int i = 0; i < tokens.length; i++) {
321 stringBuffer.append("\t").append("0");
322 }
323 for (String value : lemmas) {
324 stringBuffer.append("\t").append(value);
325 }
326
327 stringBuffer.append("\n");
328
329 writerLemmas.append(stringBuffer.toString());
330
331 }
332 }
333
334 writerLemmas.close();
335 writerFrames.close();
336
337 } catch (Throwable e) {
338 e.printStackTrace();
339 }
340 }
341
342 private static String getInterval(
343 SpanInformation span,
344 TreeMap<Integer, Integer> ids) {
345 return getInterval(span, ids, false);
346 }
347
348 private static String getInterval(
349 SpanInformation span,
350 TreeMap<Integer, Integer> ids,
351 boolean forRole) {
352
353 StringBuffer list = new StringBuffer();
354 for (Integer key : ids.keySet()) {
355 if (key >= span.start && key <= span.end) {
356 list.append("_").append(ids.get(key));
357 }
358 }
359
360 if (list.toString().length() == 0) {
361 return "";
362 }
363
364 if (!forRole) {
365 return list.toString().substring(1);
366 }
367
368 String[] parts = list.toString().substring(1).split("_+");
369 if (parts.length == 1) {
370 return parts[0];
371 }
372
373 return parts[0] + ":" + parts[parts.length - 1];
374 }
375 }