1 package eu.fbk.dkm.pikes.tintop.util;
2
3 import ch.qos.logback.classic.Level;
4 import edu.stanford.nlp.ling.CoreAnnotations;
5 import edu.stanford.nlp.pipeline.Annotation;
6 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
7 import edu.stanford.nlp.semgraph.SemanticGraph;
8 import edu.stanford.nlp.semgraph.SemanticGraphFactory;
9 import edu.stanford.nlp.trees.*;
10 import edu.stanford.nlp.util.CoreMap;
11 import edu.stanford.nlp.util.Filters;
12 import eu.fbk.fcw.utils.corpus.Corpus;
13 import eu.fbk.fcw.utils.corpus.Sentence;
14 import eu.fbk.fcw.utils.corpus.Word;
15 import eu.fbk.dkm.pikes.depparseannotation.DepParseInfo;
16 import eu.fbk.utils.core.CommandLine;
17 import org.slf4j.Logger;
18 import org.slf4j.LoggerFactory;
19
20 import java.io.BufferedWriter;
21 import java.io.File;
22 import java.io.FileWriter;
23 import java.io.IOException;
24 import java.util.Properties;
25 import java.util.concurrent.atomic.AtomicInteger;
26
27
28
29
30
31 public class ReparseConllStanford {
32
33 private static final Logger LOGGER = LoggerFactory.getLogger(ReparseConllStanford.class);
34
35 public static void main(String[] args) {
36
37 try {
38 final eu.fbk.utils.core.CommandLine cmd = eu.fbk.utils.core.CommandLine
39 .parser()
40 .withName("./reparse-conll")
41 .withHeader(
42 "Parse a document in CoNLL format with Stanford Parser, then save it in CoNLL format again")
43 .withOption("i", "input", "Input file", "FILE",
44 CommandLine.Type.FILE_EXISTING, true, false, true)
45 .withOption("o", "output", "Output file", "FILE",
46 CommandLine.Type.FILE, true, false, true)
47 .withOption("k", "keep-loops", "Keep loops (by default they will be removed)")
48 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
49 ((ch.qos.logback.classic.Logger) LoggerFactory.getLogger("edu.stanford")).setLevel(Level.ERROR);
50
51 File inputFile = cmd.getOptionValue("input", File.class);
52 File outputFile = cmd.getOptionValue("output", File.class);
53
54 boolean keepLoops = cmd.hasOption("keep-loops");
55
56 BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
57
58 Properties stanfordProps = new Properties();
59 stanfordProps.setProperty("annotators", "tokenize, ssplit, pos, parse");
60 stanfordProps.setProperty("tokenize.whitespace", "true");
61 stanfordProps.setProperty("ssplit.eolonly", "true");
62 stanfordProps.setProperty("parse.keepPunct", "true");
63
64 Corpus conll2009 = Corpus.readDocumentFromFile(inputFile, "conll2009");
65 AtomicInteger removedSentences = new AtomicInteger(0);
66 AtomicInteger totalSentences = new AtomicInteger(0);
67
68 conll2009.getSentences().parallelStream().forEach((Sentence sentence) -> {
69 totalSentences.incrementAndGet();
70 StanfordCoreNLP pipeline = new StanfordCoreNLP(stanfordProps);
71
72 StringBuilder stanfordSentenceBuilder = new StringBuilder();
73
74 for (Word word : sentence) {
75 stanfordSentenceBuilder.append(" ").append(word.getForm().replaceAll("\\s+", "_"));
76 }
77
78 String stanfordSentence = stanfordSentenceBuilder.toString().trim();
79
80 Annotation annotation = new Annotation(stanfordSentence);
81 pipeline.annotate(annotation);
82
83 CoreMap coreMap = annotation.get(CoreAnnotations.SentencesAnnotation.class).get(0);
84
85 Tree tree = coreMap.get(TreeCoreAnnotations.TreeAnnotation.class);
86 GrammaticalStructure grammaticalStructure = new EnglishGrammaticalStructure(tree,
87 Filters.acceptFilter(), new CollinsHeadFinder());
88 SemanticGraph dependencies = SemanticGraphFactory.makeFromTree(grammaticalStructure);
89
90
91
92 DepParseInfo info = new DepParseInfo(dependencies);
93
94 for (Integer id : info.getDepParents().keySet()) {
95 sentence.getWords().get(id - 1).setDepParent(info.getDepParents().get(id));
96 }
97 for (Integer id : info.getDepLabels().keySet()) {
98 sentence.getWords().get(id - 1).setDepLabel(info.getDepLabels().get(id));
99 }
100
101 boolean writeIt = true;
102 if (!keepLoops) {
103 writeIt = RemoveLoopsInConll.sentenceIsLoopFree(sentence);
104 }
105
106 if (writeIt) {
107 synchronized (writer) {
108 try {
109 writer.append(sentence.toConllString());
110 } catch (IOException e) {
111 e.printStackTrace();
112 }
113 }
114 }
115 else {
116 removedSentences.incrementAndGet();
117 }
118 });
119
120 LOGGER.info("Total sentences: {}", totalSentences);
121 LOGGER.info("Removed sentences: {}", removedSentences);
122
123 writer.close();
124
125 } catch (Exception e) {
126 CommandLine.fail(e);
127 }
128 }
129 }