1   package eu.fbk.dkm.pikes.rdf.util;
2   
3   import com.google.common.collect.*;
4   import eu.fbk.dkm.pikes.rdf.vocab.KS_OLD;
5   import eu.fbk.dkm.pikes.rdf.vocab.NIF;
6   import eu.fbk.dkm.pikes.rdf.vocab.GAF;
7   import eu.fbk.rdfpro.util.QuadModel;
8   import eu.fbk.rdfpro.util.Statements;
9   import org.eclipse.rdf4j.model.IRI;
10  import org.eclipse.rdf4j.model.*;
11  import org.eclipse.rdf4j.model.vocabulary.RDF;
12  
13  import javax.annotation.Nullable;
14  import java.util.List;
15  import java.util.Locale;
16  import java.util.Map;
17  import java.util.Set;
18  
19  // TODO: define RDFModel (quad extension of Model) and KSModel (with methods specific to KS
20  // schema)
21  
22  
23  public final class ModelUtil {
24  
25      private static final Map<String, IRI> LANGUAGE_CODES_TO_IRIS;
26  
27      private static final Map<IRI, String> LANGUAGE_IRIS_TO_CODES;
28  
29      static {
30          final Map<String, IRI> codesToIRIs = Maps.newHashMap();
31          final Map<IRI, String> urisToCodes = Maps.newHashMap();
32          for (final String language : Locale.getISOLanguages()) {
33              final Locale locale = new Locale(language);
34              final IRI uri = Statements.VALUE_FACTORY.createIRI("http://lexvo.org/id/iso639-3/",
35                      locale.getISO3Language());
36              codesToIRIs.put(language, uri);
37              urisToCodes.put(uri, language);
38          }
39          LANGUAGE_CODES_TO_IRIS = ImmutableMap.copyOf(codesToIRIs);
40          LANGUAGE_IRIS_TO_CODES = ImmutableMap.copyOf(urisToCodes);
41      }
42  
43      public static Set<Resource> getMentions(final QuadModel model) {
44          return model.filter(null, RDF.TYPE, KS_OLD.MENTION).subjects();
45      }
46  
47      public static Set<Resource> getMentions(final QuadModel model, final int beginIndex,
48              final int endIndex) {
49          final List<Resource> mentionIDs = Lists.newArrayList();
50          for (final Resource mentionID : model.filter(null, RDF.TYPE, KS_OLD.MENTION).subjects()) {
51              final Literal begin = model.filter(mentionID, NIF.BEGIN_INDEX, null).objectLiteral();
52              final Literal end = model.filter(mentionID, NIF.END_INDEX, null).objectLiteral();
53              if (begin != null && begin.intValue() >= beginIndex && end != null
54                      && end.intValue() <= endIndex) {
55                  mentionIDs.add(mentionID);
56              }
57          }
58          return ImmutableSet.copyOf(mentionIDs);
59      }
60  
61      public static QuadModel getSubModel(final QuadModel model,
62              final Iterable<? extends Resource> mentionIDs) {
63  
64          final QuadModel result = QuadModel.create();
65          final Set<Resource> nodes = Sets.newHashSet();
66  
67          // Add all the triples (i) describing the mention; (ii) linking the mention to denoted
68          // entities or expressed facts; (iii) describing expressed facts; (iv) expressed by the
69          // mention; and (v) reachable by added resources and not expressed by some mention
70          for (final Resource mentionID : mentionIDs) {
71              result.addAll(model.filter(mentionID, null, null));
72              for (final Statement triple : model.filter(null, null, mentionID)) {
73                  result.add(triple);
74                  if (triple.getPredicate().equals(KS_OLD.EXPRESSED_BY)) {
75                      final Resource factID = triple.getSubject();
76                      result.addAll(model.filter(factID, null, null));
77                      for (final Statement factTriple : model.filter(null, null, null, factID)) {
78                          result.add(factTriple);
79                          final Resource factSubj = factTriple.getSubject();
80                          final IRI factPred = factTriple.getPredicate();
81                          final Value factObj = factTriple.getObject();
82                          nodes.add(factSubj);
83                          if (factObj instanceof Resource && !factPred.equals(GAF.DENOTED_BY)) {
84                              nodes.add((Resource) factObj);
85                          }
86                      }
87                  } else {
88                      nodes.add(triple.getSubject());
89                  }
90              }
91          }
92  
93          // Add all the triples not linked to some mention rooted at some node previously extracted
94          final List<Resource> queue = Lists.newLinkedList(nodes);
95          while (!queue.isEmpty()) {
96              final Resource node = queue.remove(0);
97              for (final Statement triple : model.filter(node, null, null)) {
98                  if (triple.getContext() != null) {
99                      final Resource context = triple.getContext();
100                     if (model.filter(context, KS_OLD.EXPRESSED_BY, null).isEmpty()) {
101                         result.add(triple);
102                         if (triple.getObject() instanceof Resource) {
103                             final Resource obj = (Resource) triple.getObject();
104                             if (nodes.add(obj)) {
105                                 queue.add(obj);
106                             }
107                         }
108                     }
109                 }
110             }
111         }
112         return result;
113     }
114 
115     public static IRI languageCodeToIRI(@Nullable final String code)
116             throws IllegalArgumentException {
117         if (code == null) {
118             return null;
119         }
120         final int length = code.length();
121         if (length == 2) {
122             final IRI uri = LANGUAGE_CODES_TO_IRIS.get(code);
123             if (uri != null) {
124                 return uri;
125             }
126         } else if (length == 3) {
127             final IRI uri = Statements.VALUE_FACTORY.createIRI("http://lexvo.org/id/iso639-3/"
128                     + code);
129             if (LANGUAGE_IRIS_TO_CODES.containsKey(uri)) {
130                 return uri;
131             }
132         }
133         throw new IllegalArgumentException("Invalid language code: " + code);
134     }
135 
136     @Nullable
137     public static String languageIRIToCode(@Nullable final IRI uri)
138             throws IllegalArgumentException {
139         if (uri == null) {
140             return null;
141         }
142         final String code = LANGUAGE_IRIS_TO_CODES.get(uri);
143         if (code != null) {
144             return code;
145         }
146         throw new IllegalArgumentException("Invalid language IRI: " + uri);
147     }
148 
149     /**
150      * Clean an illegal IRI string, trying to make it legal (as per RFC 3987).
151      *
152      * @param string
153      *            the IRI string to clean
154      * @return the cleaned IRI string (possibly the input unchanged) upon success
155      * @throws IllegalArgumentException
156      *             in case the supplied input cannot be transformed into a legal IRI
157      */
158     @Nullable
159     public static String cleanIRI(@Nullable final String string) throws IllegalArgumentException {
160 
161         // TODO: we only replace illegal characters, but we should also check and fix the IRI
162         // structure
163 
164         // We implement the cleaning suggestions provided at the following URL (section 'So what
165         // exactly should I do?'), extended to deal with IRIs instead of IRIs:
166         // https://unspecified.wordpress.com/2012/02/12/how-do-you-escape-a-complete-uri/
167 
168         // Handle null input
169         if (string == null) {
170             return null;
171         }
172 
173         // Illegal characters should be percent encoded. Illegal IRI characters are all the
174         // character that are not 'unreserved' (A-Z a-z 0-9 - . _ ~ 0xA0-0xD7FF 0xF900-0xFDCF
175         // 0xFDF0-0xFFEF) or 'reserved' (! # $ % & ' ( ) * + , / : ; = ? @ [ ])
176         final StringBuilder builder = new StringBuilder();
177         for (int i = 0; i < string.length(); ++i) {
178             final char c = string.charAt(i);
179             if (c >= 'a' && c <= 'z' || c >= '?' && c <= '[' || c >= '&' && c <= ';' || c == '#'
180                     || c == '$' || c == '!' || c == '=' || c == ']' || c == '_' || c == '~'
181                     || c >= 0xA0 && c <= 0xD7FF || c >= 0xF900 && c <= 0xFDCF || c >= 0xFDF0
182                     && c <= 0xFFEF) {
183                 builder.append(c);
184             } else if (c == '%' && i < string.length() - 2
185                     && Character.digit(string.charAt(i + 1), 16) >= 0
186                     && Character.digit(string.charAt(i + 2), 16) >= 0) {
187                 builder.append('%'); // preserve valid percent encodings
188             } else {
189                 builder.append('%').append(Character.forDigit(c / 16, 16))
190                         .append(Character.forDigit(c % 16, 16));
191             }
192         }
193 
194         // Return the cleaned IRI (no Java validation as it is an IRI, not a IRI)
195         return builder.toString();
196     }
197 
198 }