1 package eu.fbk.dkm.pikes.rdf; 2 3 import java.io.File; 4 import java.io.Reader; 5 import java.io.StringWriter; 6 import java.io.Writer; 7 import java.util.Collections; 8 import java.util.List; 9 import java.util.Map; 10 import java.util.Set; 11 12 import com.google.common.base.Charsets; 13 import com.google.common.base.Strings; 14 import com.google.common.collect.HashMultimap; 15 import com.google.common.collect.Lists; 16 import com.google.common.collect.Maps; 17 import com.google.common.collect.Multimap; 18 import com.google.common.collect.Ordering; 19 import com.google.common.collect.Sets; 20 import com.google.common.io.Files; 21 22 import org.eclipse.rdf4j.model.IRI; 23 import org.eclipse.rdf4j.model.vocabulary.OWL; 24 import org.eclipse.rdf4j.model.vocabulary.RDF; 25 import org.eclipse.rdf4j.model.vocabulary.RDFS; 26 import org.eclipse.rdf4j.rio.RDFFormat; 27 import org.eclipse.rdf4j.rio.RDFHandler; 28 import org.eclipse.rdf4j.rio.RDFHandlerException; 29 import org.eclipse.rdf4j.rio.RDFWriter; 30 import org.eclipse.rdf4j.rio.Rio; 31 import org.slf4j.Logger; 32 import org.slf4j.LoggerFactory; 33 34 import ixa.kaflib.Coref; 35 import ixa.kaflib.Entity; 36 import ixa.kaflib.ExternalRef; 37 import ixa.kaflib.KAFDocument; 38 import ixa.kaflib.LinkedEntity; 39 import ixa.kaflib.Predicate; 40 import ixa.kaflib.Predicate.Role; 41 import ixa.kaflib.Span; 42 import ixa.kaflib.Term; 43 import ixa.kaflib.WF; 44 45 import eu.fbk.dkm.pikes.resources.NAFUtils; 46 import eu.fbk.rdfpro.util.IO; 47 import eu.fbk.rdfpro.util.Statements; 48 49 public class AnnotationHelper { 50 51 private static final Logger LOGGER = LoggerFactory.getLogger(AnnotationHelper.class); 52 53 private static final String NS_NB = "http://pikes.fbk.eu/ontologies/nombank#"; 54 55 private static final String NS_PB = "http://pikes.fbk.eu/ontologies/propbank#"; 56 57 private static final String NS_VN = "http://pikes.fbk.eu/ontologies/verbnet#"; 58 59 private static final String NS_FN = "http://pikes.fbk.eu/ontologies/framenet#"; 60 61 private static final String PARTICIPATION_REGEX = "" 62 + "SUB? (COORD CONJ?)* (PMOD (COORD CONJ?)*)? ((VC OPRD?)|(IM OPRD?))*"; 63 64 public static void main(final String... args) { 65 66 for (final String arg : args) { 67 68 LOGGER.info("Processing {} ...", arg); 69 70 final String lcaseName = arg.toLowerCase(); 71 int index = lcaseName.lastIndexOf(".naf"); 72 if (index < 0) { 73 index = lcaseName.lastIndexOf(".xml"); 74 if (index < 0) { 75 index = lcaseName.length(); 76 } 77 } 78 final String prefix = arg.substring(0, index); 79 80 try (Reader reader = IO.utf8Reader(IO.read(arg))) { 81 final KAFDocument document = KAFDocument.createFromStream(reader); 82 for (int i = 1; i <= document.getNumSentences(); ++i) { 83 final Writer writer = new StringWriter(); 84 final RDFWriter rdfWriter = Rio.createWriter(RDFFormat.TURTLE, writer); 85 process(document, i, rdfWriter); 86 String rdf = writer.toString(); 87 rdf = rdf.replace("\n\n", "\n"); 88 rdf = rdf.replace("# \n", "\n"); 89 Files.write(rdf, new File(prefix + "." + i + ".ttl"), Charsets.UTF_8); 90 } 91 } catch (final Throwable ex) { 92 LOGGER.error("Failed to process " + arg, ex); 93 } 94 } 95 LOGGER.info("Done"); 96 } 97 98 public static void process(final KAFDocument document, final int sentence, 99 final RDFHandler handler) throws RDFHandlerException { 100 101 final List<Term> terms = document.getSentenceTerms(sentence); 102 final String text = getText(terms); 103 104 final String ns = document.getPublic().uri + "." + sentence + "#"; 105 final Map<Term, IRI> termIRIs = getIRIs(terms, ns); 106 107 handler.startRDF(); 108 handler.handleNamespace("rdfs", RDFS.NAMESPACE); 109 handler.handleNamespace("owl", OWL.NAMESPACE); 110 handler.handleNamespace("dbpedia", "http://dbpedia.org/resource/"); 111 handler.handleNamespace("pb", NS_PB); 112 handler.handleNamespace("nb", NS_NB); 113 handler.handleNamespace("vn", NS_VN); 114 handler.handleNamespace("fn", NS_FN); 115 handler.handleNamespace("", ns); 116 117 handler.handleComment(""); 118 handler.handleComment(""); 119 handler.handleComment("=== TEXT ==="); 120 handler.handleComment(""); 121 122 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(Statements.VALUE_FACTORY.createIRI(ns), RDFS.LABEL, Statements.VALUE_FACTORY.createLiteral( 123 "\n\t" + text))); 124 125 handler.handleComment(""); 126 handler.handleComment(""); 127 handler.handleComment("=== COREFERENCE ==="); 128 129 for (final Coref coref : document.getCorefs()) { 130 final List<Span<Term>> spans = Lists.newArrayList(); 131 for (final Span<Term> span : coref.getSpans()) { 132 if (span.getFirstTarget().getSent() == sentence) { 133 spans.add(span); 134 } 135 } 136 if (spans.size() > 1) { 137 final StringBuilder builder = new StringBuilder(); 138 int index = 0; 139 for (final Span<Term> span : spans) { 140 builder.append(index == 0 ? "" : " ").append("span").append(++index) 141 .append("='").append(getText(span.getTargets())).append("'"); 142 } 143 handler.handleComment(""); 144 handler.handleComment(builder.toString()); 145 final List<IRI> headIRIs = Lists.newArrayList(); 146 for (final Span<Term> span : spans) { 147 final Term head = NAFUtils.extractHead(document, span); 148 headIRIs.add(termIRIs.get(head)); 149 } 150 Collections.sort(headIRIs, Statements.valueComparator()); 151 for (int i = 1; i < headIRIs.size(); ++i) { 152 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(headIRIs.get(0), OWL.SAMEAS, 153 headIRIs.get(i))); 154 } 155 } 156 } 157 158 handler.handleComment(""); 159 handler.handleComment(""); 160 handler.handleComment("=== LINKING ==="); 161 handler.handleComment(""); 162 163 final Multimap<IRI, IRI> links = HashMultimap.create(); 164 for (final LinkedEntity entity : document.getLinkedEntities()) { 165 if (entity.getWFs().getFirstTarget().getSent() == sentence) { 166 final Span<Term> span = KAFDocument.newTermSpan(document.getTermsByWFs(entity 167 .getWFs().getTargets())); 168 final Term head = NAFUtils.extractHead(document, span); 169 links.put(termIRIs.get(head), Statements.VALUE_FACTORY.createIRI(entity.getReference())); 170 } 171 } 172 for (final Entity entity : document.getEntities()) { 173 if (entity.getSpans().get(0).getFirstTarget().getSent() == sentence) { 174 final Term head = NAFUtils.extractHead(document, entity.getSpans().get(0)); 175 for (final ExternalRef ref : entity.getExternalRefs()) { 176 if (ref.getResource().toLowerCase().contains("spotlight")) { 177 links.put(termIRIs.get(head), Statements.VALUE_FACTORY.createIRI(ref.getReference())); 178 } 179 } 180 } 181 } 182 for (final IRI termIRI : Ordering.from(Statements.valueComparator()).sortedCopy( 183 links.keySet())) { 184 for (final IRI linkIRI : Ordering.from(Statements.valueComparator()).sortedCopy( 185 links.get(termIRI))) { 186 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(termIRI, OWL.SAMEAS, linkIRI)); 187 } 188 } 189 190 handler.handleComment(""); 191 handler.handleComment(""); 192 handler.handleComment("=== FRAMES ==="); 193 194 for (final Predicate pred : document.getPredicatesBySent(sentence)) { 195 196 final Term predTerm = NAFUtils.extractHead(document, pred.getSpan()); 197 final IRI predIRI = termIRIs.get(predTerm); 198 199 final StringBuilder builder = new StringBuilder(); 200 builder.append("pred='").append(predTerm.getStr()).append("'"); 201 for (final Role role : pred.getRoles()) { 202 builder.append(" ").append(role.getSemRole()).append("='") 203 .append(getText(role.getTerms())).append("'"); 204 } 205 handler.handleComment(""); 206 handler.handleComment(builder.toString()); 207 208 final List<IRI> typeIRIs = Lists.newArrayList(); 209 for (final ExternalRef ref : pred.getExternalRefs()) { 210 if (Strings.isNullOrEmpty(ref.getReference())) { 211 continue; 212 } else if (NAFUtils.RESOURCE_PROPBANK.equals(ref.getResource())) { 213 typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_PB + ref.getReference())); 214 } else if (NAFUtils.RESOURCE_NOMBANK.equals(ref.getResource())) { 215 typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_NB + ref.getReference())); 216 } else if (NAFUtils.RESOURCE_VERBNET.equals(ref.getResource())) { 217 typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_VN + ref.getReference())); 218 } else if (NAFUtils.RESOURCE_FRAMENET.equals(ref.getResource())) { 219 typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_FN + ref.getReference())); 220 } 221 } 222 Collections.sort(typeIRIs, Statements.valueComparator()); 223 for (final IRI typeIRI : typeIRIs) { 224 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(predIRI, RDF.TYPE, typeIRI)); 225 } 226 227 for (final Role role : pred.getRoles()) { 228 229 final Set<IRI> roleIRIs = Sets.newHashSet(); 230 roleIRIs.add(Statements.VALUE_FACTORY.createIRI((predTerm.getMorphofeat().startsWith("VB") ? NS_PB 231 : NS_NB) + role.getSemRole().toLowerCase())); 232 for (final ExternalRef ref : role.getExternalRefs()) { 233 String id = ref.getReference().toLowerCase(); 234 final int index = id.lastIndexOf('@'); 235 id = index < 0 ? id : id.substring(index + 1); 236 if (Strings.isNullOrEmpty(id)) { 237 continue; 238 } else if (NAFUtils.RESOURCE_VERBNET.equals(ref.getResource())) { 239 roleIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_VN + id)); 240 } else if (NAFUtils.RESOURCE_FRAMENET.equals(ref.getResource())) { 241 roleIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_FN + id)); 242 } 243 } 244 245 final List<IRI> argIRIs = Lists.newArrayList(); 246 final Term roleHead = NAFUtils.extractHead(document, role.getSpan()); 247 if (roleHead != null) { 248 final Set<Term> argHeads = Sets.newHashSet(); 249 for (final Term term : document.getTermsByDepAncestors( 250 Collections.singleton(roleHead), PARTICIPATION_REGEX)) { 251 final String pos = term.getMorphofeat(); 252 if (pos.startsWith("NN") || pos.startsWith("VB") || pos.startsWith("JJ") 253 || pos.startsWith("RB") || pos.startsWith("PRP") || pos.startsWith("WP")) { 254 argHeads.add(NAFUtils.syntacticToSRLHead(document, term)); 255 } 256 } 257 for (final Term argHead : argHeads) { 258 argIRIs.add(termIRIs.get(argHead)); 259 } 260 Collections.sort(argIRIs, Statements.valueComparator()); 261 } 262 263 for (final IRI roleIRI : Ordering.from(Statements.valueComparator()).sortedCopy( 264 roleIRIs)) { 265 for (final IRI argIRI : argIRIs) { 266 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(predIRI, roleIRI, argIRI)); 267 } 268 } 269 } 270 } 271 272 handler.endRDF(); 273 } 274 275 private static Map<Term, IRI> getIRIs(final List<Term> terms, final String ns) { 276 277 final List<String> termStrings = Lists.newArrayList(); 278 for (final Term term : terms) { 279 String s = term.getStr().toLowerCase(); 280 for (int i = 0; i < s.length(); ++i) { 281 final char c = s.charAt(i); 282 if (!(c >= 'a' && c <= 'z') && !(c >= 'A' && c <= 'Z') && !(c >= '0' && c <= '9') 283 && c != '-' && c != '_') { 284 s = s.substring(0, i); 285 break; 286 } 287 } 288 termStrings.add(s); 289 } 290 291 final Map<Term, IRI> uris = Maps.newHashMap(); 292 for (int i = 0; i < terms.size(); ++i) { 293 final String is = termStrings.get(i); 294 int index = 0; 295 int count = 0; 296 for (int j = 0; j < terms.size(); ++j) { 297 if (j == i) { 298 index = count; 299 } 300 if (termStrings.get(j).equals(is)) { 301 ++count; 302 } 303 } 304 final String id = count <= 1 ? is : is + "_" + (index + 1); 305 uris.put(terms.get(i), Statements.VALUE_FACTORY.createIRI(ns + id)); 306 } 307 return uris; 308 } 309 310 private static String getText(final List<Term> terms) { 311 final StringBuilder builder = new StringBuilder(); 312 boolean atBeginning = true; 313 for (final Term term : terms) { 314 for (final WF word : term.getWFs()) { 315 final String s = word.getForm(); 316 final boolean punct = ",".equals(s) || ";".equals(s) || ".".equals(s) 317 || ":".equals(s); 318 builder.append(atBeginning || punct ? "" : " "); 319 builder.append(word.getForm()); 320 atBeginning = false; 321 } 322 } 323 return builder.toString(); 324 } 325 326 }