1   package eu.fbk.dkm.pikes.tintop.util.framenet;
2   
3   import ch.qos.logback.classic.Level;
4   import com.google.common.io.Files;
5   import edu.stanford.nlp.ling.CoreAnnotations;
6   import edu.stanford.nlp.ling.CoreLabel;
7   import edu.stanford.nlp.pipeline.Annotation;
8   import edu.stanford.nlp.pipeline.StanfordCoreNLP;
9   import edu.stanford.nlp.semgraph.SemanticGraph;
10  import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
11  import edu.stanford.nlp.util.CoreMap;
12  import eu.fbk.dkm.pikes.depparseannotation.DepParseInfo;
13  import eu.fbk.dkm.pikes.depparseannotation.DepparseAnnotations;
14  import org.joox.JOOX;
15  import org.slf4j.Logger;
16  import org.slf4j.LoggerFactory;
17  import org.w3c.dom.Document;
18  import org.w3c.dom.Element;
19  
20  import javax.xml.parsers.DocumentBuilder;
21  import javax.xml.parsers.DocumentBuilderFactory;
22  import java.io.BufferedWriter;
23  import java.io.File;
24  import java.io.FileWriter;
25  import java.util.ArrayList;
26  import java.util.List;
27  import java.util.Properties;
28  import java.util.TreeMap;
29  
30  /**
31   * Created by alessio on 21/12/15.
32   */
33  
34  public class ParseFullTextWithStanford {
35  
36      private static final Logger LOGGER = LoggerFactory.getLogger(ParseFullTextWithStanford.class);
37      private static String annotatorsPattern = "tokenize, ssplit, %s, lemma, %s";
38  
39      /*
40      Todo: use pos from Stanford, as POS from FrameNet is not consistent
41       */
42  
43      static class SpanInformation {
44  
45          String label;
46          int start, end;
47  
48          public SpanInformation(String label, int start, int end) {
49              this.label = label;
50              this.start = start;
51              this.end = end;
52          }
53  
54          @Override public String toString() {
55              return "SpanInformation{" +
56                      "label='" + label + '\'' +
57                      ", start=" + start +
58                      ", end=" + end +
59                      '}';
60          }
61      }
62  
63      static class FrameInformation {
64  
65          SpanInformation target;
66          List<SpanInformation> roles = new ArrayList<>();
67          String luName, frameName;
68  
69          public FrameInformation(String luName, String frameName) {
70              this.luName = luName;
71              this.frameName = frameName;
72          }
73  
74          public void setTarget(SpanInformation target) {
75              this.target = target;
76          }
77  
78          public void addRole(SpanInformation role) {
79              this.roles.add(role);
80          }
81  
82          @Override public String toString() {
83              return "FrameInformation{" +
84                      "target=" + target +
85                      ", roles=" + roles +
86                      ", luName='" + luName + '\'' +
87                      ", frameName='" + frameName + '\'' +
88                      '}';
89          }
90      }
91  
92      /**
93       * Parse examples in FrameNet lus files and save them in Semafor format.
94       * <p>
95       * Arguments:
96       * - FrameNet LUs folder
97       * - Parsing output file (semafor.all.lemma.tags)
98       * - Frames output file (semafor.frame.elements)
99       * - POS (original vs. Stanford): fake_pos, pos
100      *
101      * @param args List of arguments (see above)
102      */
103     public static void main(String[] args) {
104 
105         if (args.length < 5) {
106             LOGGER.error("Five arguments needed: "
107                     + "fullTextPath, outputFile (parsing), outputFile (frames), posAnnotator, parseAnnotator");
108         }
109 
110         String fullTextPath = args[0];
111         String outputFile1 = args[1];
112         String outputFile2 = args[2];
113         String posAnnotator = args[3];
114         String parseAnnotator = args[4];
115 
116         String annotators = String.format(annotatorsPattern, posAnnotator, parseAnnotator);
117 
118         try {
119 
120             ((ch.qos.logback.classic.Logger) LoggerFactory.getLogger("edu.stanford")).setLevel(Level.ERROR);
121 
122             File fullTextFile = new File(fullTextPath);
123             BufferedWriter writerLemmas = new BufferedWriter(new FileWriter(outputFile1));
124             BufferedWriter writerFrames = new BufferedWriter(new FileWriter(outputFile2));
125 
126             Properties props = new Properties();
127             props.setProperty("annotators", annotators);
128             props.setProperty("customAnnotatorClass.fake_pos", "eu.fbk.fcw.utils.annotators.FakePosAnnotator");
129             props.setProperty("customAnnotatorClass.mst_server",
130                     "eu.fbk.fcw.mst.api.MstServerParserAnnotator");
131 
132             props.setProperty("tokenize.whitespace", "true");
133             props.setProperty("ssplit.eolonly", "true");
134 
135             props.setProperty("mst_server.host", "localhost");
136             props.setProperty("mst_server.port", "8012");
137 
138             StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
139 
140             DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
141             DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
142 
143             int sentNo = -1;
144             for (final File file : Files.fileTreeTraverser().preOrderTraversal(fullTextFile)) {
145                 if (!file.isFile()) {
146                     continue;
147                 }
148                 if (file.getName().startsWith(".")) {
149                     continue;
150                 }
151                 if (!file.getName().endsWith(".xml")) {
152                     continue;
153                 }
154 
155                 LOGGER.info("File: {}", file.getName());
156 
157                 Document doc = dBuilder.parse(file);
158 
159                 for (Element sentenceElement : JOOX.$(doc).find("sentence")) {
160                     String sentenceID = sentenceElement.getAttribute("ID");
161 
162                     TreeMap<Integer, String> pos = new TreeMap<>();
163                     Element textElement = JOOX.$(sentenceElement).find("text").get(0);
164                     String text = textElement.getTextContent();
165                     LOGGER.trace(text);
166                     StringBuffer stringBuffer;
167 
168                     List<FrameInformation> frames = new ArrayList<>();
169 
170                     for (Element annotationSet : JOOX.$(sentenceElement).find("annotationSet")) {
171                         String luName = annotationSet.getAttribute("luName");
172                         if (luName == null || luName.trim().length() == 0) {
173                             for (Element layer : JOOX.$(annotationSet).find("layer")) {
174                                 if (layer.getAttribute("name").equals("PENN")) {
175                                     for (Element label : JOOX.$(layer).find("label")) {
176                                         Integer start = Integer.parseInt(label.getAttribute("start"));
177                                         String thisPos = label.getAttribute("name");
178                                         pos.put(start, thisPos);
179                                     }
180                                 }
181                             }
182                         } else {
183                             FrameInformation frameInformation = new FrameInformation(
184                                     annotationSet.getAttribute("luName"), annotationSet.getAttribute("frameName"));
185                             for (Element layer : JOOX.$(annotationSet).find("layer")) {
186                                 if (layer.getAttribute("name").equals("Target")) {
187                                     for (Element label : JOOX.$(layer).find("label")) {
188                                         frameInformation.setTarget(new SpanInformation(label.getAttribute("name"),
189                                                         Integer.parseInt(label.getAttribute("start")),
190                                                         Integer.parseInt(label.getAttribute("end"))
191                                                 )
192                                         );
193                                     }
194                                 }
195                                 if (layer.getAttribute("name").equals("FE")) {
196                                     for (Element label : JOOX.$(layer).find("label")) {
197                                         String start = label.getAttribute("start");
198                                         String end = label.getAttribute("end");
199 
200                                         if (start.length() > 0 && end.length() > 0) {
201                                             frameInformation.addRole(new SpanInformation(label.getAttribute("name"),
202                                                             Integer.parseInt(start),
203                                                             Integer.parseInt(end)
204                                                     )
205                                             );
206                                         }
207                                     }
208                                 }
209                             }
210 
211                             frames.add(frameInformation);
212                         }
213                     }
214 
215                     stringBuffer = new StringBuffer();
216                     for (Integer key : pos.keySet()) {
217                         String value = pos.get(key);
218                         stringBuffer.append(value).append(" ");
219                     }
220 
221                     Annotation s;
222                     props.setProperty("fake_pos.pos", stringBuffer.toString().trim());
223                     try {
224                         pipeline = new StanfordCoreNLP(props);
225                         s = new Annotation(text);
226                         pipeline.annotate(s);
227                     } catch (Throwable e) {
228                         LOGGER.warn("Skipped sentence {}:{}", file.getName(), sentenceID);
229                         continue;
230                     }
231 
232                     sentNo++;
233                     int size = s.get(CoreAnnotations.TokensAnnotation.class).size();
234 
235                     String[] tokens = new String[size];
236                     String[] poss = new String[size];
237                     String[] depLabels = new String[size];
238                     String[] depParents = new String[size];
239                     String[] lemmas = new String[size];
240 
241                     TreeMap<Integer, Integer> ids = new TreeMap<>();
242                     for (CoreMap sentence : s.get(CoreAnnotations.SentencesAnnotation.class)) {
243 
244                         DepParseInfo info;
245 
246                         if (parseAnnotator.equals("parse")) {
247                             SemanticGraph dependencies = sentence.get(
248                                     SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
249                             info = new DepParseInfo(dependencies);
250                         } else {
251                             info = sentence.get(DepparseAnnotations.MstParserAnnotation.class);
252                         }
253 
254                         for (Integer tokenID : info.getDepParents().keySet()) {
255                             int parent = info.getDepParents().get(tokenID);
256                             depParents[tokenID - 1] = Integer.toString(parent);
257                         }
258                         for (Integer tokenID : info.getDepLabels().keySet()) {
259                             depLabels[tokenID - 1] = info.getDepLabels().get(tokenID);
260                         }
261 
262                         java.util.List<CoreLabel> get = sentence.get(CoreAnnotations.TokensAnnotation.class);
263                         for (int i = 0; i < get.size(); i++) {
264                             CoreLabel token = get.get(i);
265                             tokens[i] = token.get(CoreAnnotations.TextAnnotation.class);
266                             poss[i] = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
267                             lemmas[i] = token.get(CoreAnnotations.LemmaAnnotation.class);
268 
269                             ids.put(token.beginPosition(), i);
270                         }
271                     }
272 
273                     for (FrameInformation frame : frames) {
274                         StringBuffer frameBuffer = new StringBuffer();
275 
276                         try {
277                             int numFrameRoles = 1 + frame.roles.size();
278                             frameBuffer.append(numFrameRoles);
279                             frameBuffer.append("\t").append(frame.frameName);
280                             frameBuffer.append("\t").append(frame.luName);
281 
282                             String interval = getInterval(frame.target, ids);
283                             frameBuffer.append("\t").append(interval);
284 
285                             StringBuffer partsBuffer = new StringBuffer();
286                             String[] parts = interval.split("_+");
287                             for (String stringID : parts) {
288                                 partsBuffer.append("_").append(tokens[Integer.parseInt(stringID)]);
289                             }
290                             frameBuffer.append("\t").append(partsBuffer.toString().substring(1));
291                             frameBuffer.append("\t").append(sentNo);
292                             for (SpanInformation role : frame.roles) {
293                                 frameBuffer.append("\t").append(role.label);
294                                 frameBuffer.append("\t").append(getInterval(role, ids, true));
295                             }
296 
297                             frameBuffer.append("\n");
298                         } catch (Exception e) {
299                             LOGGER.warn("Skipped frame: {}", frame.frameName);
300                             continue;
301                         }
302 
303                         writerFrames.append(frameBuffer);
304                     }
305 
306                     stringBuffer = new StringBuffer();
307                     stringBuffer.append(size);
308                     for (String value : tokens) {
309                         stringBuffer.append("\t").append(value);
310                     }
311                     for (String value : poss) {
312                         stringBuffer.append("\t").append(value);
313                     }
314                     for (String value : depLabels) {
315                         stringBuffer.append("\t").append(value);
316                     }
317                     for (String value : depParents) {
318                         stringBuffer.append("\t").append(value);
319                     }
320                     for (int i = 0; i < tokens.length; i++) {
321                         stringBuffer.append("\t").append("0");
322                     }
323                     for (String value : lemmas) {
324                         stringBuffer.append("\t").append(value);
325                     }
326 
327                     stringBuffer.append("\n");
328 
329                     writerLemmas.append(stringBuffer.toString());
330 
331                 }
332             }
333 
334             writerLemmas.close();
335             writerFrames.close();
336 
337         } catch (Throwable e) {
338             e.printStackTrace();
339         }
340     }
341 
342     private static String getInterval(
343             SpanInformation span,
344             TreeMap<Integer, Integer> ids) {
345         return getInterval(span, ids, false);
346     }
347 
348     private static String getInterval(
349             SpanInformation span,
350             TreeMap<Integer, Integer> ids,
351             boolean forRole) {
352 
353         StringBuffer list = new StringBuffer();
354         for (Integer key : ids.keySet()) {
355             if (key >= span.start && key <= span.end) {
356                 list.append("_").append(ids.get(key));
357             }
358         }
359 
360         if (list.toString().length() == 0) {
361             return "";
362         }
363 
364         if (!forRole) {
365             return list.toString().substring(1);
366         }
367 
368         String[] parts = list.toString().substring(1).split("_+");
369         if (parts.length == 1) {
370             return parts[0];
371         }
372 
373         return parts[0] + ":" + parts[parts.length - 1];
374     }
375 }