1   package eu.fbk.dkm.pikes.tintop.util;
2   
import eu.fbk.fcw.utils.corpus.Corpus;
import eu.fbk.fcw.utils.corpus.Sentence;
import eu.fbk.fcw.utils.corpus.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
14  
15  /**
16   * Created by alessio on 28/12/15.
17   */
18  
19  public class RemoveLoopsInConll {
20  
21      private static final Logger LOGGER = LoggerFactory.getLogger(RemoveLoopsInConll.class);
22  
23      public static boolean sentenceIsLoopFree(Sentence sentence) {
24          java.util.List<Word> words = sentence.getWords();
25          for (int i = 0; i < words.size(); i++) {
26              List<Integer> ancestors = sentence.getAncestors(i);
27              int size = ancestors.size();
28              if (size > sentence.getWords().size()) {
29                  return false;
30              }
31          }
32  
33          return true;
34      }
35  
36      public static void removeLoops(String inputFile, String outputFile) throws IOException {
37          Corpus conll2009 = Corpus.readDocumentFromFile(inputFile, "conll2009");
38          BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
39  
40          AtomicInteger removedSentences = new AtomicInteger(0);
41          AtomicInteger totalSentences = new AtomicInteger(0);
42  
43          conll2009.getSentences().parallelStream().forEach((Sentence sentence) -> {
44              totalSentences.incrementAndGet();
45              boolean loopFree = sentenceIsLoopFree(sentence);
46  
47              if (loopFree) {
48                  synchronized (writer) {
49                      try {
50                          writer.append(sentence.toConllString());
51                      } catch (IOException e) {
52                          e.printStackTrace();
53                      }
54                  }
55              }
56              else {
57                  removedSentences.incrementAndGet();
58              }
59          });
60  
61          LOGGER.info("Removed {} sentences out of {}", removedSentences, totalSentences);
62  
63          writer.close();
64      }
65  
66      public static void main(String[] args) {
67          String inputFile = args[0];
68          String outputFile = args[1];
69  
70          try {
71              removeLoops(inputFile, outputFile);
72          } catch (Exception e) {
73              LOGGER.error(e.getMessage());
74          }
75      }
76  }