1 package eu.fbk.dkm.pikes.tintop.util;
2
import eu.fbk.fcw.utils.corpus.Corpus;
import eu.fbk.fcw.utils.corpus.Sentence;
import eu.fbk.fcw.utils.corpus.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
14
15
16
17
18
19 public class RemoveLoopsInConll {
20
21 private static final Logger LOGGER = LoggerFactory.getLogger(RemoveLoopsInConll.class);
22
23 public static boolean sentenceIsLoopFree(Sentence sentence) {
24 java.util.List<Word> words = sentence.getWords();
25 for (int i = 0; i < words.size(); i++) {
26 List<Integer> ancestors = sentence.getAncestors(i);
27 int size = ancestors.size();
28 if (size > sentence.getWords().size()) {
29 return false;
30 }
31 }
32
33 return true;
34 }
35
36 public static void removeLoops(String inputFile, String outputFile) throws IOException {
37 Corpus conll2009 = Corpus.readDocumentFromFile(inputFile, "conll2009");
38 BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
39
40 AtomicInteger removedSentences = new AtomicInteger(0);
41 AtomicInteger totalSentences = new AtomicInteger(0);
42
43 conll2009.getSentences().parallelStream().forEach((Sentence sentence) -> {
44 totalSentences.incrementAndGet();
45 boolean loopFree = sentenceIsLoopFree(sentence);
46
47 if (loopFree) {
48 synchronized (writer) {
49 try {
50 writer.append(sentence.toConllString());
51 } catch (IOException e) {
52 e.printStackTrace();
53 }
54 }
55 }
56 else {
57 removedSentences.incrementAndGet();
58 }
59 });
60
61 LOGGER.info("Removed {} sentences out of {}", removedSentences, totalSentences);
62
63 writer.close();
64 }
65
66 public static void main(String[] args) {
67 String inputFile = args[0];
68 String outputFile = args[1];
69
70 try {
71 removeLoops(inputFile, outputFile);
72 } catch (Exception e) {
73 LOGGER.error(e.getMessage());
74 }
75 }
76 }