1 package eu.fbk.dkm.pikes.resources.conllAIDA;
2
3 import eu.fbk.utils.core.CommandLine;
4 import ixa.kaflib.KAFDocument;
5 import org.apache.commons.lang.StringUtils;
6 import org.json.simple.JSONObject;
7 import org.json.simple.parser.JSONParser;
8 import org.slf4j.LoggerFactory;
9
10 import java.io.File;
11 import java.io.IOException;
12 import java.nio.file.Files;
13 import java.nio.file.Paths;
14 import java.text.SimpleDateFormat;
15 import java.util.ArrayList;
16 import java.util.Date;
17 import java.util.List;
18 import java.util.stream.Collectors;
19 import java.util.stream.Stream;
20
21
22
23
24
25 public class ConvertDocsFromGS {
26
27 private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
28 private static String DEFAULT_URL = "http://pikes.fbk.eu/conll/";
29
30 public static void main(String[] args) throws Exception {
31
32
33 final CommandLine cmd = CommandLine
34 .parser()
35 .withName("ConvertDocsFromGS")
36 .withHeader("Generates < YAGO entity, rdf:type , NER type> triples")
37 .withOption("c", "conll", "CONLL folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
38 .withOption("o", "output", "Output file", "FOLDER", CommandLine.Type.DIRECTORY, true,
39 false, true)
40 .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
41 CommandLine.Type.STRING, true, false, false)
42 .withLogger(LoggerFactory.getLogger("eu.fbk"))
43 .parse(args);
44
45 File conllfolder = cmd.getOptionValue("conll", File.class);
46 File outputfile = cmd.getOptionValue("output", File.class);
47
48 List<String> conll_list = new ArrayList<>();
49
50 try (Stream<String> stream = Files.lines(Paths.get(conllfolder.toString()+"/eng.train"))) {
51
52 conll_list.addAll(stream
53
54
55 .collect(Collectors.toList()));
56
57 } catch (IOException e) {
58 e.printStackTrace();
59 }
60
61
62 conll_list.add("-DOCSTART- -X- O O");
63
64 try (Stream<String> stream = Files.lines(Paths.get(conllfolder.toString()+"/eng.testa"))) {
65
66
67 conll_list.addAll(stream
68
69
70 .collect(Collectors.toList()));
71
72 } catch (IOException e) {
73 e.printStackTrace();
74 }
75
76
77 conll_list.add("-DOCSTART- -X- O O");
78
79 try (Stream<String> stream = Files.lines(Paths.get(conllfolder.toString()+"/eng.testb"))) {
80 conll_list.addAll(stream
81
82
83 .collect(Collectors.toList()));
84
85 } catch (IOException e) {
86 e.printStackTrace();
87 }
88
89 Integer ID=1;
90 conll_list.remove(0);
91 conll_list.add("-DOCSTART-");
92
93 String text="";
94
95 for (String line : conll_list
96 ) {
97
98 if (line.startsWith("-DOCSTART-")) {
99
100 if (!text.isEmpty()) {
101 File outputFile = new File(outputfile.getAbsoluteFile().toString() + "/" + StringUtils.leftPad(ID.toString(),4,"0") + ".naf");
102
103
104 outputFile.getParentFile().mkdirs();
105 KAFDocument document = new KAFDocument("en", "v3");
106
107 document.save(outputFile.getAbsolutePath());
108
109 document.setRawText(text);
110
111 KAFDocument.FileDesc fileDesc = document.createFileDesc();
112 fileDesc.title = ID.toString();
113
114 Date thisDate = new Date();
115
116 fileDesc.creationtime = sdf.format(thisDate);
117 String URL_str = ID.toString();
118 fileDesc.filename = URL_str;
119
120 String urlTemplate = DEFAULT_URL;
121 if (cmd.hasOption("url-template")) {
122 urlTemplate = cmd.getOptionValue("url-template", String.class);
123 }
124
125 KAFDocument.Public aPublic = document.createPublic();
126
127 aPublic.uri = urlTemplate + ID.toString();
128 aPublic.publicId = ID.toString();
129
130 document.save(outputFile.getAbsolutePath());
131 text="";
132 ID++;
133 }
134
135 } else if (line.isEmpty()) text+="\n";
136 else {
137 String[] conll_item = line.split(" ");
138 text+=conll_item[0]+" ";
139 }
140
141 }
142 }
143
144 }