1 package eu.fbk.dkm.pikes.resources.vuaopinion;
2
3 import eu.fbk.utils.core.CommandLine;
4 import ixa.kaflib.KAFDocument;
5 import ixa.kaflib.WF;
6 import org.apache.commons.io.FileUtils;
7 import org.apache.commons.io.FilenameUtils;
8 import org.slf4j.LoggerFactory;
9
10 import java.io.File;
11 import java.io.IOException;
12 import java.text.SimpleDateFormat;
13 import java.util.Date;
14 import java.util.Iterator;
15 import java.util.List;
16 import java.util.Locale;
17
18
19
20
21
22 public class CorpusPreprocessor {
23
24 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CorpusPreprocessor.class);
25
26 static public KAFDocument text2naf(String text) {
27 KAFDocument doc = new KAFDocument("en", "v3");
28 doc.setRawText(text);
29
30 String date = "";
31 try {
32 date = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", new Locale("en")).format(new Date());
33 } catch (Exception e) {
34 LOGGER.error(e.getMessage());
35 }
36
37 KAFDocument.Public p = doc.createPublic();
38 p.uri = "http://www.example.com";
39 p.publicId = "0";
40
41 KAFDocument.FileDesc d = doc.createFileDesc();
42 d.creationtime = date;
43 d.author = "Unknown author";
44 d.filename = "test.xml";
45 d.title = "Unknown title";
46
47 return doc;
48 }
49
50 public static void main(String[] args) {
51
52 try {
53 CommandLine cmd = null;
54 cmd = CommandLine
55 .parser()
56 .withName("corpus-preprocessor")
57 .withHeader(
58 "Convert KAF to NAF")
59 .withOption("i", "input-path", "the base EN path of the corpus", "DIR",
60 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
61 .withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
62
63 final File inputPath = cmd.getOptionValue("i", File.class);
64 if (!inputPath.exists()) {
65 throw new IOException(String.format("Folder %s does not exist", inputPath.getAbsolutePath()));
66 }
67
68 File kafPath = new File(inputPath.getAbsolutePath() + File.separator + "kaf");
69 if (!kafPath.exists()) {
70 throw new IOException(String.format("Folder %s does not exist", kafPath.getAbsolutePath()));
71 }
72 File nafPath = new File(inputPath.getAbsolutePath() + File.separator + "naf");
73 if (nafPath.exists()) {
74 throw new IOException(String.format("Folder %s exists", nafPath.getAbsolutePath()));
75 }
76 nafPath.mkdir();
77
78 Iterator<File> fileIterator;
79 fileIterator = FileUtils.iterateFiles(kafPath, new String[]{"kaf"}, false);
80
81 while (fileIterator.hasNext()) {
82 File file = fileIterator.next();
83 String fileBaseName = FilenameUtils.removeExtension(file.getName());
84 KAFDocument document = KAFDocument.createFromFile(file);
85
86 StringBuffer buffer = new StringBuffer();
87 List<WF> wFs = document.getWFs();
88 for (WF wf : wFs) {
89 buffer.append(wf.getForm());
90 buffer.append(" ");
91 }
92 String text = buffer.toString().trim();
93
94 KAFDocument doc = text2naf(text);
95 File nafFile = new File(nafPath.getAbsolutePath() + File.separator + fileBaseName + ".naf");
96 doc.save(nafFile.getAbsolutePath());
97
98 }
99
100 } catch (final Throwable ex) {
101 CommandLine.fail(ex);
102 }
103
104 }
105 }