1 package eu.fbk.dkm.pikes.resources.signalmedia;
2
3 import com.fasterxml.jackson.databind.ObjectMapper;
4 import eu.fbk.utils.core.CommandLine;
5 import ixa.kaflib.KAFDocument;
6 import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory;
8
9 import java.io.*;
10 import java.nio.charset.Charset;
11 import java.util.Map;
12 import java.util.zip.GZIPInputStream;
13
14
15
16
17
18 public class JsonToNaf {
19
20 private static final Logger LOGGER = LoggerFactory.getLogger(JsonToNaf.class);
21 private static final String DEFAULT_PREFIX = "http://signalmedia/";
22
23 public static void main(String[] args) {
24 try {
25 final CommandLine cmd = CommandLine
26 .parser()
27 .withName("./taol-extractor")
28 .withHeader("Convert file from SignalMedia JSON to NAF")
29 .withOption("i", "input", "Input file", "FILE",
30 CommandLine.Type.FILE_EXISTING, true, false, true)
31 .withOption("o", "output", "Output folder", "FOLDER",
32 CommandLine.Type.DIRECTORY, true, false, true)
33 .withOption("p", "prefix", String.format("Prefix (default %s)", DEFAULT_PREFIX), "PREFIX",
34 CommandLine.Type.STRING, true, false, false)
35 .withOption("t", "skip-title", "Do not insert title into text")
36 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
37
38 File inputFile = cmd.getOptionValue("input", File.class);
39 File outputFolder = cmd.getOptionValue("output", File.class);
40 String prefix = cmd.getOptionValue("prefix", String.class, DEFAULT_PREFIX);
41
42 boolean skipTitle = cmd.hasOption("skip-title");
43
44 if (!outputFolder.exists()) {
45 outputFolder.mkdirs();
46 }
47
48 InputStream fileStream = new FileInputStream(inputFile);
49 InputStream gzipStream = new GZIPInputStream(fileStream);
50 Reader decoder = new InputStreamReader(gzipStream, Charset.forName("UTF-8"));
51 BufferedReader reader = new BufferedReader(decoder);
52
53 String line;
54 while ((line = reader.readLine()) != null) {
55 ObjectMapper mapper = new ObjectMapper();
56 Map<String, Object> rootNode = mapper.readValue(line, Map.class);
57
58 String id = (String) rootNode.get("id");
59 String content = (String) rootNode.get("content");
60 String title = (String) rootNode.get("title");
61 String mediaType = (String) rootNode.get("media-type");
62 String source = (String) rootNode.get("source");
63 String published = (String) rootNode.get("published");
64
65 if (!skipTitle) {
66 content = title + "\n\n" + content;
67 }
68
69
70 content = content.replaceAll("]]>", "");
71
72 String simpleID = id.replaceAll("[^0-9a-zA-Z]", "");
73 String subFolder = simpleID.substring(0, 2);
74 File subFolderFile = new File(outputFolder + File.separator + subFolder);
75 subFolderFile.mkdirs();
76
77 String url = prefix + id;
78 String outputFile = outputFolder + File.separator + subFolder + File.separator + id + ".naf";
79
80 KAFDocument document = new KAFDocument("en", "v3");
81
82 KAFDocument.Public documentPublic = document.createPublic();
83 documentPublic.uri = url;
84 documentPublic.publicId = id;
85
86 KAFDocument.FileDesc documentFileDesc = document.createFileDesc();
87 documentFileDesc.filename = id + ".naf";
88 documentFileDesc.title = title;
89 documentFileDesc.creationtime = published;
90 documentFileDesc.author = source;
91 documentFileDesc.filetype = mediaType;
92
93 document.setRawText(content);
94
95 document.save(outputFile);
96 }
97 reader.close();
98
99 } catch (Exception e) {
100 CommandLine.fail(e);
101 }
102 }
103 }