1 package eu.fbk.dkm.pikes.naflib;
2
3 import com.google.common.base.Charsets;
4 import com.google.common.io.Files;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.KAFDocument;
7 import org.slf4j.Logger;
8 import org.slf4j.LoggerFactory;
9
10 import java.io.File;
11
12
13
14
15
16 public class TxtToNaf {
17
18 private static final Logger LOGGER = LoggerFactory.getLogger(TxtToNaf.class);
19 private static final String DEFAULT_PREFIX = "http://unknown/";
20 private static STRATEGY DEFAULT_STRATEGY = STRATEGY.FILENAME;
21
22 private enum STRATEGY {FILENAME, FIRSTLINE}
23
24 public static void main(String[] args) {
25 try {
26 final CommandLine cmd = CommandLine
27 .parser()
28 .withName("./nafizer")
29 .withHeader("Convert list of TXT files to NAF")
30 .withOption("i", "input", "Input folder", "FOLDER",
31 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
32 .withOption("o", "output", "Output folder", "FOLDER",
33 CommandLine.Type.DIRECTORY, true, false, true)
34 .withOption("p", "prefix", String.format("Prefix (default %s)", DEFAULT_PREFIX), "PREFIX",
35 CommandLine.Type.STRING, true, false, false)
36 .withOption("t", "title-strategy", String.format("Title strategy (default: %s)", DEFAULT_STRATEGY),
37 "strategy",
38 CommandLine.Type.STRING, true, false, false)
39 .withOption(null, "trim", "Trim text")
40 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
41
42 File inputFolder = cmd.getOptionValue("input", File.class);
43 File outputFolder = cmd.getOptionValue("output", File.class);
44 String prefix = cmd.getOptionValue("prefix", String.class, DEFAULT_PREFIX);
45
46 boolean trimText = cmd.hasOption("trim");
47
48 STRATEGY strategy;
49
50 try {
51 strategy = STRATEGY.valueOf(cmd.getOptionValue("title-strategy", String.class));
52 } catch (Exception e) {
53 strategy = STRATEGY.FILENAME;
54 }
55
56 if (!outputFolder.exists()) {
57 outputFolder.mkdirs();
58 }
59
60 int i = 0;
61 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
62 if (!file.isFile()) {
63 continue;
64 }
65 if (file.getName().startsWith(".")) {
66 continue;
67 }
68 if (!file.getName().endsWith(".txt")) {
69 continue;
70 }
71
72 String fileContent = Files.toString(file, Charsets.UTF_8);
73 if (trimText) {
74 fileContent = fileContent.trim();
75 }
76
77 if (fileContent == null || fileContent.length() == 0) {
78 continue;
79 }
80
81 i++;
82
83 File outputFile = new File(
84 outputFolder.getAbsolutePath() + File.separator +
85 file.getAbsolutePath().substring(inputFolder.getAbsolutePath().length()) + ".naf");
86 Files.createParentDirs(outputFile);
87
88 String title = null;
89 switch (strategy) {
90 case FILENAME:
91 title = file.getName();
92 break;
93 case FIRSTLINE:
94 String[] parts = fileContent.split("\n");
95 title = parts[0].trim();
96 break;
97 }
98
99 KAFDocument document = new KAFDocument("en", "v3");
100
101 KAFDocument.Public documentPublic = document.createPublic();
102 documentPublic.uri = prefix + i;
103 documentPublic.publicId = "" + i;
104
105 KAFDocument.FileDesc documentFileDesc = document.createFileDesc();
106 documentFileDesc.filename = file.getName();
107 documentFileDesc.title = title;
108
109 document.setRawText(fileContent);
110 LOGGER.info(outputFile.getAbsolutePath());
111 document.save(outputFile.getAbsolutePath());
112 }
113 } catch (Exception e) {
114 CommandLine.fail(e);
115 }
116 }
117 }