1   package eu.fbk.dkm.pikes.resources.signalmedia;
2   
3   import com.fasterxml.jackson.databind.ObjectMapper;
4   import eu.fbk.utils.core.CommandLine;
5   import ixa.kaflib.KAFDocument;
6   import org.slf4j.Logger;
7   import org.slf4j.LoggerFactory;
8   
9   import java.io.*;
10  import java.nio.charset.Charset;
11  import java.util.Map;
12  import java.util.zip.GZIPInputStream;
13  
14  /**
15   * Created by alessio on 28/12/15.
16   */
17  
18  public class JsonToNaf {
19  
20      private static final Logger LOGGER = LoggerFactory.getLogger(JsonToNaf.class);
21      private static final String DEFAULT_PREFIX = "http://signalmedia/";
22  
23      public static void main(String[] args) {
24          try {
25              final CommandLine cmd = CommandLine
26                      .parser()
27                      .withName("./taol-extractor")
28                      .withHeader("Convert file from SignalMedia JSON to NAF")
29                      .withOption("i", "input", "Input file", "FILE",
30                              CommandLine.Type.FILE_EXISTING, true, false, true)
31                      .withOption("o", "output", "Output folder", "FOLDER",
32                              CommandLine.Type.DIRECTORY, true, false, true)
33                      .withOption("p", "prefix", String.format("Prefix (default %s)", DEFAULT_PREFIX), "PREFIX",
34                              CommandLine.Type.STRING, true, false, false)
35                      .withOption("t", "skip-title", "Do not insert title into text")
36                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
37  
38              File inputFile = cmd.getOptionValue("input", File.class);
39              File outputFolder = cmd.getOptionValue("output", File.class);
40              String prefix = cmd.getOptionValue("prefix", String.class, DEFAULT_PREFIX);
41  
42              boolean skipTitle = cmd.hasOption("skip-title");
43  
44              if (!outputFolder.exists()) {
45                  outputFolder.mkdirs();
46              }
47  
48              InputStream fileStream = new FileInputStream(inputFile);
49              InputStream gzipStream = new GZIPInputStream(fileStream);
50              Reader decoder = new InputStreamReader(gzipStream, Charset.forName("UTF-8"));
51              BufferedReader reader = new BufferedReader(decoder);
52  
53              String line;
54              while ((line = reader.readLine()) != null) {
55                  ObjectMapper mapper = new ObjectMapper();
56                  Map<String, Object> rootNode = mapper.readValue(line, Map.class);
57  
58                  String id = (String) rootNode.get("id");
59                  String content = (String) rootNode.get("content");
60                  String title = (String) rootNode.get("title");
61                  String mediaType = (String) rootNode.get("media-type");
62                  String source = (String) rootNode.get("source");
63                  String published = (String) rootNode.get("published");
64  
65                  if (!skipTitle) {
66                      content = title + "\n\n" + content;
67                  }
68  
69                  // Fix a stupid bug in the dataset
70                  content = content.replaceAll("]]>", "");
71  
72                  String simpleID = id.replaceAll("[^0-9a-zA-Z]", "");
73                  String subFolder = simpleID.substring(0, 2);
74                  File subFolderFile = new File(outputFolder + File.separator + subFolder);
75                  subFolderFile.mkdirs();
76  
77                  String url = prefix + id;
78                  String outputFile = outputFolder + File.separator + subFolder + File.separator + id + ".naf";
79  
80                  KAFDocument document = new KAFDocument("en", "v3");
81  
82                  KAFDocument.Public documentPublic = document.createPublic();
83                  documentPublic.uri = url;
84                  documentPublic.publicId = id;
85  
86                  KAFDocument.FileDesc documentFileDesc = document.createFileDesc();
87                  documentFileDesc.filename = id + ".naf";
88                  documentFileDesc.title = title;
89                  documentFileDesc.creationtime = published;
90                  documentFileDesc.author = source;
91                  documentFileDesc.filetype = mediaType;
92  
93                  document.setRawText(content);
94  
95                  document.save(outputFile);
96              }
97              reader.close();
98  
99          } catch (Exception e) {
100             CommandLine.fail(e);
101         }
102     }
103 }