1   package eu.fbk.dkm.pikes.resources.boxer;
2   
3   import eu.fbk.rdfpro.util.Statements;
4   import eu.fbk.utils.core.CommandLine;
5   import ixa.kaflib.KAFDocument;
6   import org.slf4j.LoggerFactory;
7   
8   import java.io.BufferedReader;
9   import java.io.File;
10  import java.io.FileReader;
11  import java.util.ArrayList;
12  
13  /**
14   * Created by alessio on 05/05/15.
15   */
16  
17  public class CorpusSplitter {
18  
19  	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CorpusSplitter.class);
20  	public static final Integer sentencesPerCluster = 50;
21  	private static final String NAMESPACE = "http://www.newsreader-project.eu/eu.fbk.dkm.pikes.resources.boxer/";
22  
23  	private static void createDocument(ArrayList<String> list, File folder, Integer index) {
24  
25  		StringBuffer buffer = new StringBuffer();
26  		for (String line:list) {
27  			buffer.append(line);
28  			buffer.append("\n");
29  		}
30  
31  		String text = buffer.toString();
32  		String nafFileName = index + ".naf";
33  		File nafFile = new File(folder.getAbsolutePath() + File.separator + nafFileName);
34  		String documentURI = NAMESPACE + nafFileName;
35  
36  		final KAFDocument document = new KAFDocument("en", "v3");
37  		document.setRawText(text);
38  		document.createPublic();
39  		document.getPublic().publicId = Statements.VALUE_FACTORY.createIRI(documentURI).getLocalName();
40  		document.getPublic().uri = documentURI;
41  		document.createFileDesc();
42  		document.getFileDesc().filename = nafFileName;
43  		document.getFileDesc().title = "-";
44  		document.save(nafFile.getAbsolutePath());
45  
46  	}
47  
48  	public static void main(String[] args) {
49  		try {
50  			final CommandLine cmd = CommandLine
51  					.parser()
52  					.withName("eu.fbk.dkm.pikes.resources.darmstadt-loader")
53  					.withHeader("Load Boxer corpus and split it")
54  					.withOption("i", "input-file", "corpus file", "DIR", CommandLine.Type.FILE_EXISTING, true, false, true)
55  					.withOption("o", "output-folder", "output folder", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
56  //					.withOption("f", "force", "Force opinion")
57  					.withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
58  
59  			final File inputFile = cmd.getOptionValue("i", File.class);
60  			final File outputFolder = cmd.getOptionValue("o", File.class);
61  
62  			if (!outputFolder.exists()) {
63  				outputFolder.mkdirs();
64  			}
65  
66  			BufferedReader reader = new BufferedReader(new FileReader(inputFile));
67  			ArrayList<String> list = new ArrayList<>();
68  			String line;
69  
70  			int index = 0;
71  
72  			while ((line = reader.readLine()) != null) {
73  				index++;
74  				line = line.trim();
75  				list.add(line);
76  				if (list.size() >= sentencesPerCluster) {
77  					createDocument(list, outputFolder, index);
78  					list = new ArrayList<>();
79  				}
80  			}
81  			if (list.size() > 0) {
82  				createDocument(list, outputFolder, index);
83  			}
84  			reader.close();
85  
86  		} catch (final Throwable ex) {
87  			CommandLine.fail(ex);
88  		}
89  
90  	}
91  
92  }