1   package eu.fbk.dkm.pikes.tintop.util;
2   
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import eu.fbk.utils.core.IO;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Properties;
15  
16  /**
17   * Created by alessio on 24/09/16.
18   */
19  
20  public class TextToCat {
21  
22      private static final Logger LOGGER = LoggerFactory.getLogger(TextToCat.class);
23  
24      public static void main(String[] args) throws IOException {
25          String inputFolder = args[0];
26          String outputFolder = args[1];
27  
28          File inputFile = new File(inputFolder);
29          File outputFile = new File(outputFolder);
30  
31          if (!inputFile.exists()) {
32              LOGGER.error("Folder {} does not exist", inputFolder);
33              System.exit(1);
34          }
35          if (!inputFile.isDirectory()) {
36              LOGGER.error("Folder {} is not a valid folder", inputFolder);
37              System.exit(1);
38          }
39          if (!outputFile.exists()) {
40              if (!outputFile.mkdirs()) {
41                  LOGGER.error("Unable to create folder {}", outputFolder);
42                  System.exit(1);
43              }
44          } else {
45              if (outputFile.isFile()) {
46                  LOGGER.error("Folder {} is a file", outputFolder);
47                  System.exit(1);
48              }
49          }
50  
51          Properties properties = new Properties();
52          properties.setProperty("annotators", "tokenize, ssplit");
53          properties.setProperty("ssplit.newlineIsSentenceBreak", "always");
54  
55          StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);
56  
57          for (File file : inputFile.listFiles()) {
58              InputStream stream = IO.read(file.getAbsolutePath());
59              Reader reader = new InputStreamReader(stream);
60              StringBuilder inputText = new StringBuilder();
61              int i;
62              while ((i = reader.read()) != -1) {
63                  inputText.append((char) i);
64              }
65              reader.close();
66              String text = inputText.toString();
67  
68              Annotation document = new Annotation(text);
69              pipeline.annotate(document);
70  
71              File output = new File(outputFile.getAbsolutePath() + File.separator + file.getName());
72              OutputStream write = IO.write(output.getAbsolutePath());
73              BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(write));
74  
75              List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
76              for (CoreMap sentence : sentences) {
77                  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
78                  for (CoreLabel token : tokens) {
79                      writer.append(token.originalText()).append("\n");
80                  }
81                  writer.append("<eos>\n");
82              }
83  
84              writer.close();
85              write.close();
86  
87  //            System.out.println(file.getName());
88  //            System.out.println(text);
89  //            System.out.println();
90          }
91  
92      }
93  }