1   package eu.fbk.dkm.pikes.rdf;
2   
import java.io.File;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import com.google.common.io.Files;

import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.vocabulary.OWL;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDFS;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandler;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFWriter;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import ixa.kaflib.Coref;
import ixa.kaflib.Entity;
import ixa.kaflib.ExternalRef;
import ixa.kaflib.KAFDocument;
import ixa.kaflib.LinkedEntity;
import ixa.kaflib.Predicate;
import ixa.kaflib.Predicate.Role;
import ixa.kaflib.Span;
import ixa.kaflib.Term;
import ixa.kaflib.WF;

import eu.fbk.dkm.pikes.resources.NAFUtils;
import eu.fbk.rdfpro.util.IO;
import eu.fbk.rdfpro.util.Statements;
48  
49  public class AnnotationHelper {
50  
51      private static final Logger LOGGER = LoggerFactory.getLogger(AnnotationHelper.class);
52  
53      private static final String NS_NB = "http://pikes.fbk.eu/ontologies/nombank#";
54  
55      private static final String NS_PB = "http://pikes.fbk.eu/ontologies/propbank#";
56  
57      private static final String NS_VN = "http://pikes.fbk.eu/ontologies/verbnet#";
58  
59      private static final String NS_FN = "http://pikes.fbk.eu/ontologies/framenet#";
60  
61      private static final String PARTICIPATION_REGEX = ""
62              + "SUB? (COORD CONJ?)* (PMOD (COORD CONJ?)*)? ((VC OPRD?)|(IM OPRD?))*";
63  
64      public static void main(final String... args) {
65  
66          for (final String arg : args) {
67  
68              LOGGER.info("Processing {} ...", arg);
69  
70              final String lcaseName = arg.toLowerCase();
71              int index = lcaseName.lastIndexOf(".naf");
72              if (index < 0) {
73                  index = lcaseName.lastIndexOf(".xml");
74                  if (index < 0) {
75                      index = lcaseName.length();
76                  }
77              }
78              final String prefix = arg.substring(0, index);
79  
80              try (Reader reader = IO.utf8Reader(IO.read(arg))) {
81                  final KAFDocument document = KAFDocument.createFromStream(reader);
82                  for (int i = 1; i <= document.getNumSentences(); ++i) {
83                      final Writer writer = new StringWriter();
84                      final RDFWriter rdfWriter = Rio.createWriter(RDFFormat.TURTLE, writer);
85                      process(document, i, rdfWriter);
86                      String rdf = writer.toString();
87                      rdf = rdf.replace("\n\n", "\n");
88                      rdf = rdf.replace("# \n", "\n");
89                      Files.write(rdf, new File(prefix + "." + i + ".ttl"), Charsets.UTF_8);
90                  }
91              } catch (final Throwable ex) {
92                  LOGGER.error("Failed to process " + arg, ex);
93              }
94          }
95          LOGGER.info("Done");
96      }
97  
98      public static void process(final KAFDocument document, final int sentence,
99              final RDFHandler handler) throws RDFHandlerException {
100 
101         final List<Term> terms = document.getSentenceTerms(sentence);
102         final String text = getText(terms);
103 
104         final String ns = document.getPublic().uri + "." + sentence + "#";
105         final Map<Term, IRI> termIRIs = getIRIs(terms, ns);
106 
107         handler.startRDF();
108         handler.handleNamespace("rdfs", RDFS.NAMESPACE);
109         handler.handleNamespace("owl", OWL.NAMESPACE);
110         handler.handleNamespace("dbpedia", "http://dbpedia.org/resource/");
111         handler.handleNamespace("pb", NS_PB);
112         handler.handleNamespace("nb", NS_NB);
113         handler.handleNamespace("vn", NS_VN);
114         handler.handleNamespace("fn", NS_FN);
115         handler.handleNamespace("", ns);
116 
117         handler.handleComment("");
118         handler.handleComment("");
119         handler.handleComment("=== TEXT ===");
120         handler.handleComment("");
121 
122         handler.handleStatement(Statements.VALUE_FACTORY.createStatement(Statements.VALUE_FACTORY.createIRI(ns), RDFS.LABEL, Statements.VALUE_FACTORY.createLiteral(
123                 "\n\t" + text)));
124 
125         handler.handleComment("");
126         handler.handleComment("");
127         handler.handleComment("=== COREFERENCE ===");
128 
129         for (final Coref coref : document.getCorefs()) {
130             final List<Span<Term>> spans = Lists.newArrayList();
131             for (final Span<Term> span : coref.getSpans()) {
132                 if (span.getFirstTarget().getSent() == sentence) {
133                     spans.add(span);
134                 }
135             }
136             if (spans.size() > 1) {
137                 final StringBuilder builder = new StringBuilder();
138                 int index = 0;
139                 for (final Span<Term> span : spans) {
140                     builder.append(index == 0 ? "" : "  ").append("span").append(++index)
141                             .append("='").append(getText(span.getTargets())).append("'");
142                 }
143                 handler.handleComment("");
144                 handler.handleComment(builder.toString());
145                 final List<IRI> headIRIs = Lists.newArrayList();
146                 for (final Span<Term> span : spans) {
147                     final Term head = NAFUtils.extractHead(document, span);
148                     headIRIs.add(termIRIs.get(head));
149                 }
150                 Collections.sort(headIRIs, Statements.valueComparator());
151                 for (int i = 1; i < headIRIs.size(); ++i) {
152                     handler.handleStatement(Statements.VALUE_FACTORY.createStatement(headIRIs.get(0), OWL.SAMEAS,
153                             headIRIs.get(i)));
154                 }
155             }
156         }
157 
158         handler.handleComment("");
159         handler.handleComment("");
160         handler.handleComment("=== LINKING ===");
161         handler.handleComment("");
162 
163         final Multimap<IRI, IRI> links = HashMultimap.create();
164         for (final LinkedEntity entity : document.getLinkedEntities()) {
165             if (entity.getWFs().getFirstTarget().getSent() == sentence) {
166                 final Span<Term> span = KAFDocument.newTermSpan(document.getTermsByWFs(entity
167                         .getWFs().getTargets()));
168                 final Term head = NAFUtils.extractHead(document, span);
169                 links.put(termIRIs.get(head), Statements.VALUE_FACTORY.createIRI(entity.getReference()));
170             }
171         }
172         for (final Entity entity : document.getEntities()) {
173             if (entity.getSpans().get(0).getFirstTarget().getSent() == sentence) {
174                 final Term head = NAFUtils.extractHead(document, entity.getSpans().get(0));
175                 for (final ExternalRef ref : entity.getExternalRefs()) {
176                     if (ref.getResource().toLowerCase().contains("spotlight")) {
177                         links.put(termIRIs.get(head), Statements.VALUE_FACTORY.createIRI(ref.getReference()));
178                     }
179                 }
180             }
181         }
182         for (final IRI termIRI : Ordering.from(Statements.valueComparator()).sortedCopy(
183                 links.keySet())) {
184             for (final IRI linkIRI : Ordering.from(Statements.valueComparator()).sortedCopy(
185                     links.get(termIRI))) {
186                 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(termIRI, OWL.SAMEAS, linkIRI));
187             }
188         }
189 
190         handler.handleComment("");
191         handler.handleComment("");
192         handler.handleComment("=== FRAMES ===");
193 
194         for (final Predicate pred : document.getPredicatesBySent(sentence)) {
195 
196             final Term predTerm = NAFUtils.extractHead(document, pred.getSpan());
197             final IRI predIRI = termIRIs.get(predTerm);
198 
199             final StringBuilder builder = new StringBuilder();
200             builder.append("pred='").append(predTerm.getStr()).append("'");
201             for (final Role role : pred.getRoles()) {
202                 builder.append("  ").append(role.getSemRole()).append("='")
203                         .append(getText(role.getTerms())).append("'");
204             }
205             handler.handleComment("");
206             handler.handleComment(builder.toString());
207 
208             final List<IRI> typeIRIs = Lists.newArrayList();
209             for (final ExternalRef ref : pred.getExternalRefs()) {
210                 if (Strings.isNullOrEmpty(ref.getReference())) {
211                     continue;
212                 } else if (NAFUtils.RESOURCE_PROPBANK.equals(ref.getResource())) {
213                     typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_PB + ref.getReference()));
214                 } else if (NAFUtils.RESOURCE_NOMBANK.equals(ref.getResource())) {
215                     typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_NB + ref.getReference()));
216                 } else if (NAFUtils.RESOURCE_VERBNET.equals(ref.getResource())) {
217                     typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_VN + ref.getReference()));
218                 } else if (NAFUtils.RESOURCE_FRAMENET.equals(ref.getResource())) {
219                     typeIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_FN + ref.getReference()));
220                 }
221             }
222             Collections.sort(typeIRIs, Statements.valueComparator());
223             for (final IRI typeIRI : typeIRIs) {
224                 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(predIRI, RDF.TYPE, typeIRI));
225             }
226 
227             for (final Role role : pred.getRoles()) {
228 
229                 final Set<IRI> roleIRIs = Sets.newHashSet();
230                 roleIRIs.add(Statements.VALUE_FACTORY.createIRI((predTerm.getMorphofeat().startsWith("VB") ? NS_PB
231                         : NS_NB) + role.getSemRole().toLowerCase()));
232                 for (final ExternalRef ref : role.getExternalRefs()) {
233                     String id = ref.getReference().toLowerCase();
234                     final int index = id.lastIndexOf('@');
235                     id = index < 0 ? id : id.substring(index + 1);
236                     if (Strings.isNullOrEmpty(id)) {
237                         continue;
238                     } else if (NAFUtils.RESOURCE_VERBNET.equals(ref.getResource())) {
239                         roleIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_VN + id));
240                     } else if (NAFUtils.RESOURCE_FRAMENET.equals(ref.getResource())) {
241                         roleIRIs.add(Statements.VALUE_FACTORY.createIRI(NS_FN + id));
242                     }
243                 }
244 
245                 final List<IRI> argIRIs = Lists.newArrayList();
246                 final Term roleHead = NAFUtils.extractHead(document, role.getSpan());
247                 if (roleHead != null) {
248                     final Set<Term> argHeads = Sets.newHashSet();
249                     for (final Term term : document.getTermsByDepAncestors(
250                             Collections.singleton(roleHead), PARTICIPATION_REGEX)) {
251                         final String pos = term.getMorphofeat();
252                         if (pos.startsWith("NN") || pos.startsWith("VB") || pos.startsWith("JJ")
253                                 || pos.startsWith("RB") || pos.startsWith("PRP") || pos.startsWith("WP")) {
254                             argHeads.add(NAFUtils.syntacticToSRLHead(document, term));
255                         }
256                     }
257                     for (final Term argHead : argHeads) {
258                         argIRIs.add(termIRIs.get(argHead));
259                     }
260                     Collections.sort(argIRIs, Statements.valueComparator());
261                 }
262 
263                 for (final IRI roleIRI : Ordering.from(Statements.valueComparator()).sortedCopy(
264                         roleIRIs)) {
265                     for (final IRI argIRI : argIRIs) {
266                         handler.handleStatement(Statements.VALUE_FACTORY.createStatement(predIRI, roleIRI, argIRI));
267                     }
268                 }
269             }
270         }
271 
272         handler.endRDF();
273     }
274 
275     private static Map<Term, IRI> getIRIs(final List<Term> terms, final String ns) {
276 
277         final List<String> termStrings = Lists.newArrayList();
278         for (final Term term : terms) {
279             String s = term.getStr().toLowerCase();
280             for (int i = 0; i < s.length(); ++i) {
281                 final char c = s.charAt(i);
282                 if (!(c >= 'a' && c <= 'z') && !(c >= 'A' && c <= 'Z') && !(c >= '0' && c <= '9')
283                         && c != '-' && c != '_') {
284                     s = s.substring(0, i);
285                     break;
286                 }
287             }
288             termStrings.add(s);
289         }
290 
291         final Map<Term, IRI> uris = Maps.newHashMap();
292         for (int i = 0; i < terms.size(); ++i) {
293             final String is = termStrings.get(i);
294             int index = 0;
295             int count = 0;
296             for (int j = 0; j < terms.size(); ++j) {
297                 if (j == i) {
298                     index = count;
299                 }
300                 if (termStrings.get(j).equals(is)) {
301                     ++count;
302                 }
303             }
304             final String id = count <= 1 ? is : is + "_" + (index + 1);
305             uris.put(terms.get(i), Statements.VALUE_FACTORY.createIRI(ns + id));
306         }
307         return uris;
308     }
309 
310     private static String getText(final List<Term> terms) {
311         final StringBuilder builder = new StringBuilder();
312         boolean atBeginning = true;
313         for (final Term term : terms) {
314             for (final WF word : term.getWFs()) {
315                 final String s = word.getForm();
316                 final boolean punct = ",".equals(s) || ";".equals(s) || ".".equals(s)
317                         || ":".equals(s);
318                 builder.append(atBeginning || punct ? "" : " ");
319                 builder.append(word.getForm());
320                 atBeginning = false;
321             }
322         }
323         return builder.toString();
324     }
325 
326 }