1 package eu.fbk.dkm.pikes.resources.boxer;
2
3 import eu.fbk.rdfpro.util.Statements;
4 import eu.fbk.utils.core.CommandLine;
5 import ixa.kaflib.KAFDocument;
6 import org.slf4j.LoggerFactory;
7
8 import java.io.BufferedReader;
9 import java.io.File;
10 import java.io.FileReader;
11 import java.util.ArrayList;
12
13
14
15
16
17 public class CorpusSplitter {
18
19 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CorpusSplitter.class);
20 public static final Integer sentencesPerCluster = 50;
21 private static final String NAMESPACE = "http://www.newsreader-project.eu/eu.fbk.dkm.pikes.resources.boxer/";
22
23 private static void createDocument(ArrayList<String> list, File folder, Integer index) {
24
25 StringBuffer buffer = new StringBuffer();
26 for (String line:list) {
27 buffer.append(line);
28 buffer.append("\n");
29 }
30
31 String text = buffer.toString();
32 String nafFileName = index + ".naf";
33 File nafFile = new File(folder.getAbsolutePath() + File.separator + nafFileName);
34 String documentURI = NAMESPACE + nafFileName;
35
36 final KAFDocument document = new KAFDocument("en", "v3");
37 document.setRawText(text);
38 document.createPublic();
39 document.getPublic().publicId = Statements.VALUE_FACTORY.createIRI(documentURI).getLocalName();
40 document.getPublic().uri = documentURI;
41 document.createFileDesc();
42 document.getFileDesc().filename = nafFileName;
43 document.getFileDesc().title = "-";
44 document.save(nafFile.getAbsolutePath());
45
46 }
47
48 public static void main(String[] args) {
49 try {
50 final CommandLine cmd = CommandLine
51 .parser()
52 .withName("eu.fbk.dkm.pikes.resources.darmstadt-loader")
53 .withHeader("Load Boxer corpus and split it")
54 .withOption("i", "input-file", "corpus file", "DIR", CommandLine.Type.FILE_EXISTING, true, false, true)
55 .withOption("o", "output-folder", "output folder", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
56
57 .withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
58
59 final File inputFile = cmd.getOptionValue("i", File.class);
60 final File outputFolder = cmd.getOptionValue("o", File.class);
61
62 if (!outputFolder.exists()) {
63 outputFolder.mkdirs();
64 }
65
66 BufferedReader reader = new BufferedReader(new FileReader(inputFile));
67 ArrayList<String> list = new ArrayList<>();
68 String line;
69
70 int index = 0;
71
72 while ((line = reader.readLine()) != null) {
73 index++;
74 line = line.trim();
75 list.add(line);
76 if (list.size() >= sentencesPerCluster) {
77 createDocument(list, outputFolder, index);
78 list = new ArrayList<>();
79 }
80 }
81 if (list.size() > 0) {
82 createDocument(list, outputFolder, index);
83 }
84 reader.close();
85
86 } catch (final Throwable ex) {
87 CommandLine.fail(ex);
88 }
89
90 }
91
92 }