1 package eu.fbk.dkm.pikes.tintop.util;
2
3 import edu.stanford.nlp.ling.CoreAnnotations;
4 import edu.stanford.nlp.ling.CoreLabel;
5 import edu.stanford.nlp.pipeline.Annotation;
6 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
7 import edu.stanford.nlp.util.CoreMap;
8 import eu.fbk.utils.core.IO;
9 import org.slf4j.Logger;
10 import org.slf4j.LoggerFactory;
11
12 import java.io.*;
13 import java.util.List;
14 import java.util.Properties;
15
16
17
18
19
20 public class TextToCat {
21
22 private static final Logger LOGGER = LoggerFactory.getLogger(TextToCat.class);
23
24 public static void main(String[] args) throws IOException {
25 String inputFolder = args[0];
26 String outputFolder = args[1];
27
28 File inputFile = new File(inputFolder);
29 File outputFile = new File(outputFolder);
30
31 if (!inputFile.exists()) {
32 LOGGER.error("Folder {} does not exist", inputFolder);
33 System.exit(1);
34 }
35 if (!inputFile.isDirectory()) {
36 LOGGER.error("Folder {} is not a valid folder", inputFolder);
37 System.exit(1);
38 }
39 if (!outputFile.exists()) {
40 if (!outputFile.mkdirs()) {
41 LOGGER.error("Unable to create folder {}", outputFolder);
42 System.exit(1);
43 }
44 } else {
45 if (outputFile.isFile()) {
46 LOGGER.error("Folder {} is a file", outputFolder);
47 System.exit(1);
48 }
49 }
50
51 Properties properties = new Properties();
52 properties.setProperty("annotators", "tokenize, ssplit");
53 properties.setProperty("ssplit.newlineIsSentenceBreak", "always");
54
55 StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);
56
57 for (File file : inputFile.listFiles()) {
58 InputStream stream = IO.read(file.getAbsolutePath());
59 Reader reader = new InputStreamReader(stream);
60 StringBuilder inputText = new StringBuilder();
61 int i;
62 while ((i = reader.read()) != -1) {
63 inputText.append((char) i);
64 }
65 reader.close();
66 String text = inputText.toString();
67
68 Annotation document = new Annotation(text);
69 pipeline.annotate(document);
70
71 File output = new File(outputFile.getAbsolutePath() + File.separator + file.getName());
72 OutputStream write = IO.write(output.getAbsolutePath());
73 BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(write));
74
75 List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
76 for (CoreMap sentence : sentences) {
77 List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
78 for (CoreLabel token : tokens) {
79 writer.append(token.originalText()).append("\n");
80 }
81 writer.append("<eos>\n");
82 }
83
84 writer.close();
85 write.close();
86
87
88
89
90 }
91
92 }
93 }