package eu.fbk.dkm.pikes.tintop.util;

import ch.qos.logback.classic.Level;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphFactory;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Filters;
import eu.fbk.dkm.pikes.depparseannotation.DepParseInfo;
import eu.fbk.fcw.utils.corpus.Corpus;
import eu.fbk.fcw.utils.corpus.Sentence;
import eu.fbk.fcw.utils.corpus.Word;
import eu.fbk.utils.core.CommandLine;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;
26  
/**
 * Command-line tool that re-parses a CoNLL-2009 corpus with the Stanford
 * constituency parser, converts the trees to dependencies, and writes the
 * sentences back out in CoNLL format.
 *
 * Created by alessio on 26/02/15.
 */
30  
31  public class ReparseConllStanford {
32  
33      private static final Logger LOGGER = LoggerFactory.getLogger(ReparseConllStanford.class);
34  
35      public static void main(String[] args) {
36  
37          try {
38              final eu.fbk.utils.core.CommandLine cmd = eu.fbk.utils.core.CommandLine
39                      .parser()
40                      .withName("./reparse-conll")
41                      .withHeader(
42                              "Parse a document in CoNLL format with Stanford Parser, then save it in CoNLL format again")
43                      .withOption("i", "input", "Input file", "FILE",
44                              CommandLine.Type.FILE_EXISTING, true, false, true)
45                      .withOption("o", "output", "Output file", "FILE",
46                              CommandLine.Type.FILE, true, false, true)
47                      .withOption("k", "keep-loops", "Keep loops (by default they will be removed)")
48                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
49              ((ch.qos.logback.classic.Logger) LoggerFactory.getLogger("edu.stanford")).setLevel(Level.ERROR);
50  
51              File inputFile = cmd.getOptionValue("input", File.class);
52              File outputFile = cmd.getOptionValue("output", File.class);
53  
54              boolean keepLoops = cmd.hasOption("keep-loops");
55  
56              BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
57  
58              Properties stanfordProps = new Properties();
59              stanfordProps.setProperty("annotators", "tokenize, ssplit, pos, parse");
60              stanfordProps.setProperty("tokenize.whitespace", "true");
61              stanfordProps.setProperty("ssplit.eolonly", "true");
62              stanfordProps.setProperty("parse.keepPunct", "true");
63  
64              Corpus conll2009 = Corpus.readDocumentFromFile(inputFile, "conll2009");
65              AtomicInteger removedSentences = new AtomicInteger(0);
66              AtomicInteger totalSentences = new AtomicInteger(0);
67  
68              conll2009.getSentences().parallelStream().forEach((Sentence sentence) -> {
69                  totalSentences.incrementAndGet();
70                  StanfordCoreNLP pipeline = new StanfordCoreNLP(stanfordProps);
71  
72                  StringBuilder stanfordSentenceBuilder = new StringBuilder();
73  
74                  for (Word word : sentence) {
75                      stanfordSentenceBuilder.append(" ").append(word.getForm().replaceAll("\\s+", "_"));
76                  }
77  
78                  String stanfordSentence = stanfordSentenceBuilder.toString().trim();
79  
80                  Annotation annotation = new Annotation(stanfordSentence);
81                  pipeline.annotate(annotation);
82  
83                  CoreMap coreMap = annotation.get(CoreAnnotations.SentencesAnnotation.class).get(0);
84  
85                  Tree tree = coreMap.get(TreeCoreAnnotations.TreeAnnotation.class);
86                  GrammaticalStructure grammaticalStructure = new EnglishGrammaticalStructure(tree,
87                          Filters.acceptFilter(), new CollinsHeadFinder());
88                  SemanticGraph dependencies = SemanticGraphFactory.makeFromTree(grammaticalStructure);
89  //                SemanticGraph dependencies = SemanticGraphFactory
90  //                        .makeFromTree(grammaticalStructure, SemanticGraphFactory.Mode.BASIC,
91  //                                GrammaticalStructure.Extras.NONE, true, null);
92                  DepParseInfo info = new DepParseInfo(dependencies);
93  
94                  for (Integer id : info.getDepParents().keySet()) {
95                      sentence.getWords().get(id - 1).setDepParent(info.getDepParents().get(id));
96                  }
97                  for (Integer id : info.getDepLabels().keySet()) {
98                      sentence.getWords().get(id - 1).setDepLabel(info.getDepLabels().get(id));
99                  }
100 
101                 boolean writeIt = true;
102                 if (!keepLoops) {
103                     writeIt = RemoveLoopsInConll.sentenceIsLoopFree(sentence);
104                 }
105 
106                 if (writeIt) {
107                     synchronized (writer) {
108                         try {
109                             writer.append(sentence.toConllString());
110                         } catch (IOException e) {
111                             e.printStackTrace();
112                         }
113                     }
114                 }
115                 else {
116                     removedSentences.incrementAndGet();
117                 }
118             });
119 
120             LOGGER.info("Total sentences: {}", totalSentences);
121             LOGGER.info("Removed sentences: {}", removedSentences);
122 
123             writer.close();
124 
125         } catch (Exception e) {
126             CommandLine.fail(e);
127         }
128     }
129 }