1 package eu.fbk.dkm.pikes.resources.conllAIDA;
2
3 import com.google.common.io.Files;
4 import eu.fbk.rdfpro.util.IO;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.Entity;
7 import ixa.kaflib.ExternalRef;
8 import ixa.kaflib.KAFDocument;
9 import ixa.kaflib.Term;
10 import org.apache.commons.lang.StringUtils;
11 import org.slf4j.LoggerFactory;
12
13 import java.io.*;
14 import java.text.SimpleDateFormat;
15 import java.util.*;
16 import java.util.stream.Collectors;
17
18
19
20
21 public class DistillEntities {
22
23 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(DistillEntities.class);
24 private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
25
26
27 public enum removeLayer {
28 deps, chunks, entities, properties, categories, coreferences, opinions, relations, srl, constituency, timeExpressions, linkedEntities, constituencyStrings;
29 }
30
31 public static void main(String[] args) {
32 try {
33 final CommandLine cmd = CommandLine
34 .parser()
35 .withName("stripNAF")
36 .withHeader("Strip NAF files of unnecessary layers")
37 .withOption("i", "input-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
38 .withOption("o", "output-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
39 .withOption("m", "mode", "modality (1=entity-centric, 2=term-centric)", "INT", CommandLine.Type.NON_NEGATIVE_INTEGER, true, false, false)
40 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
41
42 File inputFolder = cmd.getOptionValue("input-folder", File.class);
43 File outputFolder = cmd.getOptionValue("output-folder", File.class);
44 Integer modality = cmd.getOptionValue("mode", Integer.class,1);
45
46 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
47 if (!file.isFile()) {
48 continue;
49 }
50 if (file.getName().startsWith(".")) {
51 continue;
52 }
53
54 if (!file.getName().endsWith(".naf.gz")) {
55 continue;
56 }
57
58 System.out.print("Processing: "+file.getAbsoluteFile().toString());
59
60 String inputfilenamelocal = file.getName();
61 String outputfilenamelocal = StringUtils.leftPad(inputfilenamelocal.replace(".naf",".tsv").replace("conll-",""), 11, '0');
62
63
64 String outputfilename = file.getAbsoluteFile().toString().replace(inputFolder.getAbsolutePath(),outputFolder.getAbsolutePath()).replace(inputfilenamelocal,outputfilenamelocal);
65
66 File outputFile = new File(outputfilename);
67
68 if (!outputFile.exists()) {
69
70 try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(file.getAbsoluteFile().toString())))) {
71 try {
72
73
74
75 KAFDocument document = KAFDocument.createFromStream(reader);
76 reader.close();
77
78 HashMap<Integer,String> toPrint = new HashMap();
79 String ID = document.getPublic().publicId;
80
81 if (modality==1) {
82
83
84 final List<Entity> entities = document.getEntities();
85
86 for (Entity entity : entities
87 ) {
88 String type = entity.getType();
89 List<ExternalRef> externalRefs = entity.getExternalRefs();
90 HashMap<String, Float> refs = new HashMap();
91 for (ExternalRef exref : externalRefs
92 ) {
93 if (exref.getResource().equalsIgnoreCase("dbpedia-candidates")) {
94 String ref = exref.getReference();
95 float confidence = exref.getConfidence();
96 refs.put(ref, confidence);
97 } else refs.put("null", 0.0f);
98 }
99
100 String span = entity.getStr();
101 String IDs = "";
102 Integer spanID = Integer.parseInt(entity.getTerms().get(0).getId().replace("t", "")) - 1;
103 for (Term t : entity.getTerms()
104 ) {
105 IDs += String.valueOf(Integer.parseInt(t.getId().replace("t", "")) - 1) + " ";
106 }
107
108
109 String references = refs.entrySet().stream()
110 .sorted(Map.Entry.<String, Float>comparingByValue().reversed()).map(e -> e.getKey().replace("http://dbpedia.org/resource/", "") + " [" + e.getValue() + "]").collect(Collectors.joining(" "));
111 String line = ID.replace("conll-", "") + "\t" + IDs + "\t" + span + "\t" + type + "\t" + references;
112
113 toPrint.put(spanID, line);
114 }
115 } else if (modality==2) {
116
117 final List<Term> terms = document.getTerms();
118 Collections.sort(terms, new Comparator<Term>() {
119 @Override
120 public int compare(Term o1, Term o2) {
121 return Integer.compare(o1.getOffset(),o2.getOffset());
122 }
123 });
124 for (Term t : terms
125 ) {
126
127 Integer t_ID = Integer.parseInt(t.getId().replace("t",""))-1;
128
129 String line = t.getForm() + "\t" + t_ID;
130
131
132 List<Entity> entities = document.getEntitiesByTerm(t);
133
134
135
136 if (entities.size()>0) {
137 Entity entity = entities.get(0);
138 String type = entity.getType();
139
140 List<ExternalRef> externalRefs = entity.getExternalRefs();
141 HashMap<String, Float> refs = new HashMap();
142 for (ExternalRef exref : externalRefs
143 ) {
144 if (exref.getResource().equalsIgnoreCase("dbpedia-candidates")) {
145 String ref = exref.getReference();
146 float confidence = exref.getConfidence();
147 refs.put(ref, confidence);
148 } else refs.put("null", 0.0f);
149 }
150 String span = entity.getStr();
151 String IDs = "";
152 Integer spanID = Integer.parseInt(entity.getTerms().get(0).getId().replace("t", "")) - 1;
153 for (Term tt : entity.getTerms()
154 ) {
155 IDs += String.valueOf(Integer.parseInt(t.getId().replace("t", "")) - 1) + " ";
156 }
157
158
159 String references = refs.entrySet().stream()
160 .sorted(Map.Entry.<String, Float>comparingByValue().reversed()).map(e -> e.getKey().replace("http://dbpedia.org/resource/", "") + " [" + e.getValue() + "]").collect(Collectors.joining(" "));
161
162 line += "\t" + type + "\t" + references;
163
164
165 }
166 toPrint.put(t_ID, line);
167 }
168
169 }
170
171
172
173
174 Files.createParentDirs(outputFile);
175 try (Writer w = IO.utf8Writer(IO.buffer(IO.write(outputFile.getAbsolutePath())))) {
176 toPrint.entrySet().stream()
177 .sorted(Map.Entry.<Integer, String>comparingByKey()).map(e -> e.getValue()).forEach(e -> {
178 try {
179 w.write(e+"\n");
180 } catch (IOException e1) {
181 e1.printStackTrace();
182 }
183 });
184
185 w.close();
186
187
188 } catch (IOException ee) {
189 ee.printStackTrace();
190 }
191
192 System.out.println(" DONE!");
193
194
195 } catch (Exception e) {
196
197 }
198
199 }
200
201
202 }
203
204 }
205 } catch (FileNotFoundException e) {
206 e.printStackTrace();
207 } catch (IOException e) {
208 e.printStackTrace();
209 }
210 }
211 }