1 package eu.fbk.dkm.pikes.rdf;
2
import java.io.File;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
11
12 import com.google.common.base.Charsets;
13 import com.google.common.base.Strings;
14 import com.google.common.collect.HashMultimap;
15 import com.google.common.collect.Lists;
16 import com.google.common.collect.Maps;
17 import com.google.common.collect.Multimap;
18 import com.google.common.collect.Ordering;
19 import com.google.common.collect.Sets;
20 import com.google.common.io.Files;
21
22 import org.eclipse.rdf4j.model.IRI;
23 import org.eclipse.rdf4j.model.vocabulary.OWL;
24 import org.eclipse.rdf4j.model.vocabulary.RDF;
25 import org.eclipse.rdf4j.model.vocabulary.RDFS;
26 import org.eclipse.rdf4j.rio.RDFFormat;
27 import org.eclipse.rdf4j.rio.RDFHandler;
28 import org.eclipse.rdf4j.rio.RDFHandlerException;
29 import org.eclipse.rdf4j.rio.RDFWriter;
30 import org.eclipse.rdf4j.rio.Rio;
31 import org.slf4j.Logger;
32 import org.slf4j.LoggerFactory;
33
34 import ixa.kaflib.Coref;
35 import ixa.kaflib.Entity;
36 import ixa.kaflib.ExternalRef;
37 import ixa.kaflib.KAFDocument;
38 import ixa.kaflib.LinkedEntity;
39 import ixa.kaflib.Predicate;
40 import ixa.kaflib.Predicate.Role;
41 import ixa.kaflib.Span;
42 import ixa.kaflib.Term;
43 import ixa.kaflib.WF;
44
45 import eu.fbk.dkm.pikes.resources.NAFUtils;
46 import eu.fbk.rdfpro.util.IO;
47 import eu.fbk.rdfpro.util.Statements;
48
49 public class AnnotationHelper {
50
51 private static final Logger LOGGER = LoggerFactory.getLogger(AnnotationHelper.class);
52
53 private static final String NS_NB = "http://pikes.fbk.eu/ontologies/nombank#";
54
55 private static final String NS_PB = "http://pikes.fbk.eu/ontologies/propbank#";
56
57 private static final String NS_VN = "http://pikes.fbk.eu/ontologies/verbnet#";
58
59 private static final String NS_FN = "http://pikes.fbk.eu/ontologies/framenet#";
60
61 private static final String PARTICIPATION_REGEX = ""
62 + "SUB? (COORD CONJ?)* (PMOD (COORD CONJ?)*)? ((VC OPRD?)|(IM OPRD?))*";
63
64 public static void main(final String... args) {
65
66 for (final String arg : args) {
67
68 LOGGER.info("Processing {} ...", arg);
69
70 final String lcaseName = arg.toLowerCase();
71 int index = lcaseName.lastIndexOf(".naf");
72 if (index < 0) {
73 index = lcaseName.lastIndexOf(".xml");
74 if (index < 0) {
75 index = lcaseName.length();
76 }
77 }
78 final String prefix = arg.substring(0, index);
79
80 try (Reader reader = IO.utf8Reader(IO.read(arg))) {
81 final KAFDocument document = KAFDocument.createFromStream(reader);
82 for (int i = 1; i <= document.getNumSentences(); ++i) {
83 final Writer writer = new StringWriter();
84 final RDFWriter rdfWriter = Rio.createWriter(RDFFormat.TURTLE, writer);
85 process(document, i, rdfWriter);
86 String rdf = writer.toString();
87 rdf = rdf.replace("\n\n", "\n");
88 rdf = rdf.replace("# \n", "\n");
89 Files.write(rdf, new File(prefix + "." + i + ".ttl"), Charsets.UTF_8);
90 }
91 } catch (final Throwable ex) {
92 LOGGER.error("Failed to process " + arg, ex);
93 }
94 }
95 LOGGER.info("Done");
96 }
97
98 public static void process(final KAFDocument document, final int sentence,
99 final RDFHandler handler) throws RDFHandlerException {
100
101 final List<Term> terms = document.getSentenceTerms(sentence);
102 final String text = getText(terms);
103
104 final String ns = document.getPublic().uri + "." + sentence + "#";
105 final Map<Term, IRI> termIRIs = getIRIs(terms, ns);
106
107 handler.startRDF();
108 handler.handleNamespace("rdfs", RDFS.NAMESPACE);
109 handler.handleNamespace("owl", OWL.NAMESPACE);
110 handler.handleNamespace("dbpedia", "http://dbpedia.org/resource/");
111 handler.handleNamespace("pb", NS_PB);
112 handler.handleNamespace("nb", NS_NB);
113 handler.handleNamespace("vn", NS_VN);
114 handler.handleNamespace("fn", NS_FN);
115 handler.handleNamespace("", ns);
116
117 handler.handleComment("");
118 handler.handleComment("");
119 handler.handleComment("=== TEXT ===");
120 handler.handleComment("");
121
122 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(Statements.VALUE_FACTORY.createIRI(ns), RDFS.LABEL, Statements.VALUE_FACTORY.createLiteral(
123 "\n\t" + text)));
124
125 handler.handleComment("");
126 handler.handleComment("");
127 handler.handleComment("=== COREFERENCE ===");
128
129 for (final Coref coref : document.getCorefs()) {
130 final List<Span<Term>> spans = Lists.newArrayList();
131 for (final Span<Term> span : coref.getSpans()) {
132 if (span.getFirstTarget().getSent() == sentence) {
133 spans.add(span);
134 }
135 }
136 if (spans.size() > 1) {
137 final StringBuilder builder = new StringBuilder();
138 int index = 0;
139 for (final Span<Term> span : spans) {
140 builder.append(index == 0 ? "" : " ").append("span").append(++index)
141 .append("='").append(getText(span.getTargets())).append("'");
142 }
143 handler.handleComment("");
144 handler.handleComment(builder.toString());
145 final List<IRI> headIRIs = Lists.newArrayList();
146 for (final Span<Term> span : spans) {
147 final Term head = NAFUtils.extractHead(document, span);
148 headIRIs.add(termIRIs.get(head));
149 }
150 Collections.sort(headIRIs, Statements.valueComparator());
151 for (int i = 1; i < headIRIs.size(); ++i) {
152 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(headIRIs.get(0), OWL.SAMEAS,
153 headIRIs.get(i)));
154 }
155 }
156 }
157
158 handler.handleComment("");
159 handler.handleComment("");
160 handler.handleComment("=== LINKING ===");
161 handler.handleComment("");
162
163 final Multimap<IRI, IRI> links = HashMultimap.create();
164 for (final LinkedEntity entity : document.getLinkedEntities()) {
165 if (entity.getWFs().getFirstTarget().getSent() == sentence) {
166 final Span<Term> span = KAFDocument.newTermSpan(document.getTermsByWFs(entity
167 .getWFs().getTargets()));
168 final Term head = NAFUtils.extractHead(document, span);
169 links.put(termIRIs.get(head), Statements.VALUE_FACTORY.createIRI(entity.getReference()));
170 }
171 }
172 for (final Entity entity : document.getEntities()) {
173 if (entity.getSpans().get(0).getFirstTarget().getSent() == sentence) {
174 final Term head = NAFUtils.extractHead(document, entity.getSpans().get(0));
175 for (final ExternalRef ref : entity.getExternalRefs()) {
176 if (ref.getResource().toLowerCase().contains("spotlight")) {
177 links.put(termIRIs.get(head), Statements.VALUE_FACTORY.createIRI(ref.getReference()));
178 }
179 }
180 }
181 }
182 for (final IRI termIRI : Ordering.from(Statements.valueComparator()).sortedCopy(
183 links.keySet())) {
184 for (final IRI linkIRI : Ordering.from(Statements.valueComparator()).sortedCopy(
185 links.get(termIRI))) {
186 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(termIRI, OWL.SAMEAS, linkIRI));
187 }
188 }
189
190 handler.handleComment("");
191 handler.handleComment("");
192 handler.handleComment("=== FRAMES ===");
193
194 for (final Predicate pred : document.getPredicatesBySent(sentence)) {
195
196 final Term predTerm = NAFUtils.extractHead(document, pred.getSpan());
197 final IRI predIRI = termIRIs.get(predTerm);
198
199 final StringBuilder builder = new StringBuilder();
200 builder.append("pred='").append(predTerm.getStr()).append("'");
201 for (final Role role : pred.getRoles()) {
202 builder.append(" ").append(role.getSemRole()).append("='")
203 .append(getText(role.getTerms())).append("'");
204 }
205 handler.handleComment("");
206 handler.handleComment(builder.toString());
207
208 final List<IRI> typeIRIs = Lists.newArrayList();
209 for (final ExternalRef ref : pred.getExternalRefs()) {
210 if (Strings.isNullOrEmpty(ref.getReference())) {
211 continue;
212 } else if (NAFUtils.RESOURCE_PROPBANK.equals(ref.getResource())) {
213 typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_PB + ref.getReference()));
214 } else if (NAFUtils.RESOURCE_NOMBANK.equals(ref.getResource())) {
215 typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_NB + ref.getReference()));
216 } else if (NAFUtils.RESOURCE_VERBNET.equals(ref.getResource())) {
217 typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_VN + ref.getReference()));
218 } else if (NAFUtils.RESOURCE_FRAMENET.equals(ref.getResource())) {
219 typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_FN + ref.getReference()));
220 }
221 }
222 Collections.sort(typeIRIs, Statements.valueComparator());
223 for (final IRI typeIRI : typeIRIs) {
224 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(predIRI, RDF.TYPE, typeIRI));
225 }
226
227 for (final Role role : pred.getRoles()) {
228
229 final Set<IRI> roleIRIs = Sets.newHashSet();
230 roleIRIs.add(Statements.VALUE_FACTORY.createIRI((predTerm.getMorphofeat().startsWith("VB") ? NS_PB
231 : NS_NB) + role.getSemRole().toLowerCase()));
232 for (final ExternalRef ref : role.getExternalRefs()) {
233 String id = ref.getReference().toLowerCase();
234 final int index = id.lastIndexOf('@');
235 id = index < 0 ? id : id.substring(index + 1);
236 if (Strings.isNullOrEmpty(id)) {
237 continue;
238 } else if (NAFUtils.RESOURCE_VERBNET.equals(ref.getResource())) {
239 roleIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_VN + id));
240 } else if (NAFUtils.RESOURCE_FRAMENET.equals(ref.getResource())) {
241 roleIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_FN + id));
242 }
243 }
244
245 final List<IRI> argIRIs = Lists.newArrayList();
246 final Term roleHead = NAFUtils.extractHead(document, role.getSpan());
247 if (roleHead != null) {
248 final Set<Term> argHeads = Sets.newHashSet();
249 for (final Term term : document.getTermsByDepAncestors(
250 Collections.singleton(roleHead), PARTICIPATION_REGEX)) {
251 final String pos = term.getMorphofeat();
252 if (pos.startsWith("NN") || pos.startsWith("VB") || pos.startsWith("JJ")
253 || pos.startsWith("RB") || pos.startsWith("PRP") || pos.startsWith("WP")) {
254 argHeads.add(NAFUtils.syntacticToSRLHead(document, term));
255 }
256 }
257 for (final Term argHead : argHeads) {
258 argIRIs.add(termIRIs.get(argHead));
259 }
260 Collections.sort(argIRIs, Statements.valueComparator());
261 }
262
263 for (final IRI roleIRI : Ordering.from(Statements.valueComparator()).sortedCopy(
264 roleIRIs)) {
265 for (final IRI argIRI : argIRIs) {
266 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(predIRI, roleIRI, argIRI));
267 }
268 }
269 }
270 }
271
272 handler.endRDF();
273 }
274
275 private static Map<Term, IRI> getIRIs(final List<Term> terms, final String ns) {
276
277 final List<String> termStrings = Lists.newArrayList();
278 for (final Term term : terms) {
279 String s = term.getStr().toLowerCase();
280 for (int i = 0; i < s.length(); ++i) {
281 final char c = s.charAt(i);
282 if (!(c >= 'a' && c <= 'z') && !(c >= 'A' && c <= 'Z') && !(c >= '0' && c <= '9')
283 && c != '-' && c != '_') {
284 s = s.substring(0, i);
285 break;
286 }
287 }
288 termStrings.add(s);
289 }
290
291 final Map<Term, IRI> uris = Maps.newHashMap();
292 for (int i = 0; i < terms.size(); ++i) {
293 final String is = termStrings.get(i);
294 int index = 0;
295 int count = 0;
296 for (int j = 0; j < terms.size(); ++j) {
297 if (j == i) {
298 index = count;
299 }
300 if (termStrings.get(j).equals(is)) {
301 ++count;
302 }
303 }
304 final String id = count <= 1 ? is : is + "_" + (index + 1);
305 uris.put(terms.get(i), Statements.VALUE_FACTORY.createIRI(ns + id));
306 }
307 return uris;
308 }
309
310 private static String getText(final List<Term> terms) {
311 final StringBuilder builder = new StringBuilder();
312 boolean atBeginning = true;
313 for (final Term term : terms) {
314 for (final WF word : term.getWFs()) {
315 final String s = word.getForm();
316 final boolean punct = ",".equals(s) || ";".equals(s) || ".".equals(s)
317 || ":".equals(s);
318 builder.append(atBeginning || punct ? "" : " ");
319 builder.append(word.getForm());
320 atBeginning = false;
321 }
322 }
323 return builder.toString();
324 }
325
326 }