1 package eu.fbk.dkm.pikes.resources.conllAIDA;
2
3 import eu.fbk.utils.core.CommandLine;
4 import ixa.kaflib.KAFDocument;
5 import org.apache.commons.lang.StringUtils;
6 import org.slf4j.LoggerFactory;
7
8 import java.io.File;
9 import java.io.IOException;
10 import java.nio.file.Files;
11 import java.nio.file.Paths;
12 import java.text.SimpleDateFormat;
13 import java.util.ArrayList;
14 import java.util.Date;
15 import java.util.List;
16 import java.util.stream.Collectors;
17 import java.util.stream.Stream;
18
19
20
21
22 public class ConvertDocsFromAIDAGS {
23
24 private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
25 private static String DEFAULT_URL = "http://pikes.fbk.eu/conll/";
26
27 public static void main(String[] args) throws Exception {
28
29
30 final CommandLine cmd = CommandLine
31 .parser()
32 .withName("ConvertDocsFromAIDAGS")
33 .withHeader("Generates < YAGO entity, rdf:type , NER type> triples")
34 .withOption("a", "aida", "AIDA-YAGO2-dataset.tsv", "FILE", CommandLine.Type.FILE, true, false, true)
35 .withOption("o", "output", "Output file", "FOLDER", CommandLine.Type.DIRECTORY, true,
36 false, true)
37 .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
38 CommandLine.Type.STRING, true, false, false)
39 .withLogger(LoggerFactory.getLogger("eu.fbk"))
40 .parse(args);
41
42 File aidagold = cmd.getOptionValue("aida", File.class);
43 File outputfile = cmd.getOptionValue("output", File.class);
44
45 List<String> conll_list = new ArrayList<>();
46
47 try (Stream<String> stream = Files.lines(Paths.get(aidagold.toString()))) {
48
49 conll_list.addAll(stream
50
51
52 .collect(Collectors.toList()));
53
54 } catch (IOException e) {
55 e.printStackTrace();
56 }
57
58
59
60 String IDstr = getID(conll_list.get(0));
61 conll_list.remove(0);
62 conll_list.add("-DOCSTART-");
63
64 String text="";
65
66 for (String line : conll_list
67 ) {
68
69 if (line.startsWith("-DOCSTART-")) {
70
71
72
73 if (!text.isEmpty()) {
74
75 Integer ID=Integer.parseInt(IDstr.split(" ")[0].replace("testa","").replace("testb",""));
76
77 File outputFile = new File(outputfile.getAbsoluteFile().toString() + "/" + StringUtils.leftPad(ID.toString(),4,"0") + ".naf");
78
79
80 outputFile.getParentFile().mkdirs();
81 KAFDocument document = new KAFDocument("en", "v3");
82
83 document.save(outputFile.getAbsolutePath());
84
85 document.setRawText(text);
86
87 KAFDocument.FileDesc fileDesc = document.createFileDesc();
88 fileDesc.title = ID.toString();
89
90 Date thisDate = new Date();
91
92 fileDesc.creationtime = sdf.format(thisDate);
93 String URL_str = ID.toString();
94 fileDesc.filename = URL_str;
95
96 String urlTemplate = DEFAULT_URL;
97 if (cmd.hasOption("url-template")) {
98 urlTemplate = cmd.getOptionValue("url-template", String.class);
99 }
100
101 KAFDocument.Public aPublic = document.createPublic();
102
103 aPublic.uri = urlTemplate + ID.toString();
104 aPublic.publicId = IDstr;
105
106
107
108
109 document.save(outputFile.getAbsolutePath());
110 text="";
111
112
113 }
114
115 if (!line.equals("-DOCSTART-")) IDstr=getID(line);
116
117 } else if (line.isEmpty()) text+="\n";
118 else {
119 String[] conll_item = line.split("\t");
120 text+=conll_item[0]+" ";
121 }
122
123 }
124
125
126 }
127
128 private static String getID (String line){
129
130
131 return line.substring(line.indexOf("(")+1,line.indexOf(")"));
132
133 }
134
135 }