1   package eu.fbk.dkm.pikes.naflib;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.KAFDocument;
7   import org.slf4j.Logger;
8   import org.slf4j.LoggerFactory;
9   
10  import java.io.File;
11  
12  /**
13   * Created by alessio on 17/12/15.
14   */
15  
16  public class TxtToNaf {
17  
18      private static final Logger LOGGER = LoggerFactory.getLogger(TxtToNaf.class);
19      private static final String DEFAULT_PREFIX = "http://unknown/";
20      private static STRATEGY DEFAULT_STRATEGY = STRATEGY.FILENAME;
21  
22      private enum STRATEGY {FILENAME, FIRSTLINE}
23  
24      public static void main(String[] args) {
25          try {
26              final CommandLine cmd = CommandLine
27                      .parser()
28                      .withName("./nafizer")
29                      .withHeader("Convert list of TXT files to NAF")
30                      .withOption("i", "input", "Input folder", "FOLDER",
31                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
32                      .withOption("o", "output", "Output folder", "FOLDER",
33                              CommandLine.Type.DIRECTORY, true, false, true)
34                      .withOption("p", "prefix", String.format("Prefix (default %s)", DEFAULT_PREFIX), "PREFIX",
35                              CommandLine.Type.STRING, true, false, false)
36                      .withOption("t", "title-strategy", String.format("Title strategy (default: %s)", DEFAULT_STRATEGY),
37                              "strategy",
38                              CommandLine.Type.STRING, true, false, false)
39                      .withOption(null, "trim", "Trim text")
40                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
41  
42              File inputFolder = cmd.getOptionValue("input", File.class);
43              File outputFolder = cmd.getOptionValue("output", File.class);
44              String prefix = cmd.getOptionValue("prefix", String.class, DEFAULT_PREFIX);
45  
46              boolean trimText = cmd.hasOption("trim");
47  
48              STRATEGY strategy;
49  
50              try {
51                  strategy = STRATEGY.valueOf(cmd.getOptionValue("title-strategy", String.class));
52              } catch (Exception e) {
53                  strategy = STRATEGY.FILENAME;
54              }
55  
56              if (!outputFolder.exists()) {
57                  outputFolder.mkdirs();
58              }
59  
60              int i = 0;
61              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
62                  if (!file.isFile()) {
63                      continue;
64                  }
65                  if (file.getName().startsWith(".")) {
66                      continue;
67                  }
68                  if (!file.getName().endsWith(".txt")) {
69                      continue;
70                  }
71  
72                  String fileContent = Files.toString(file, Charsets.UTF_8);
73                  if (trimText) {
74                      fileContent = fileContent.trim();
75                  }
76  
77                  if (fileContent == null || fileContent.length() == 0) {
78                      continue;
79                  }
80  
81                  i++;
82  
83                  File outputFile = new File(
84                          outputFolder.getAbsolutePath() + File.separator +
85                                  file.getAbsolutePath().substring(inputFolder.getAbsolutePath().length()) + ".naf");
86                  Files.createParentDirs(outputFile);
87  
88                  String title = null;
89                  switch (strategy) {
90                  case FILENAME:
91                      title = file.getName();
92                      break;
93                  case FIRSTLINE:
94                      String[] parts = fileContent.split("\n");
95                      title = parts[0].trim();
96                      break;
97                  }
98  
99                  KAFDocument document = new KAFDocument("en", "v3");
100 
101                 KAFDocument.Public documentPublic = document.createPublic();
102                 documentPublic.uri = prefix + i;
103                 documentPublic.publicId = "" + i;
104 
105                 KAFDocument.FileDesc documentFileDesc = document.createFileDesc();
106                 documentFileDesc.filename = file.getName();
107                 documentFileDesc.title = title;
108 
109                 document.setRawText(fileContent);
110                 LOGGER.info(outputFile.getAbsolutePath());
111                 document.save(outputFile.getAbsolutePath());
112             }
113         } catch (Exception e) {
114             CommandLine.fail(e);
115         }
116     }
117 }