1   package eu.fbk.dkm.pikes.resources;
2   
3   import java.io.File;
4   import java.io.Writer;
5   import java.util.Collection;
6   import java.util.List;
7   import java.util.Map;
8   import java.util.Set;
9   
10  import javax.annotation.Nullable;
11  
12  import com.google.common.base.Charsets;
13  import com.google.common.collect.HashMultimap;
14  import com.google.common.collect.ImmutableMap;
15  import com.google.common.collect.Iterables;
16  import com.google.common.collect.Lists;
17  import com.google.common.collect.Maps;
18  import com.google.common.collect.Multimap;
19  import com.google.common.collect.Ordering;
20  import com.google.common.collect.Sets;
21  import com.google.common.io.Resources;
22  
23  import org.eclipse.rdf4j.model.Resource;
24  import org.eclipse.rdf4j.model.Statement;
25  import org.eclipse.rdf4j.model.IRI;
26  import org.eclipse.rdf4j.model.Value;
27  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
28  import org.eclipse.rdf4j.model.impl.ValueFactoryImpl;
29  import org.eclipse.rdf4j.model.vocabulary.RDFS;
30  import org.eclipse.rdf4j.rio.RDFHandlerException;
31  import org.slf4j.Logger;
32  import org.slf4j.LoggerFactory;
33  
34  import eu.fbk.utils.core.CommandLine;
35  import eu.fbk.utils.core.CommandLine.Type;
36  import eu.fbk.rdfpro.AbstractRDFHandler;
37  import eu.fbk.rdfpro.RDFSource;
38  import eu.fbk.rdfpro.RDFSources;
39  import eu.fbk.rdfpro.tql.TQL;
40  import eu.fbk.rdfpro.util.IO;
41  
42  public final class YagoTaxonomy {
43  
44      public static final String NAMESPACE = "http://dbpedia.org/class/yago/";
45  
46      private static final Map<String, Concept> ID_INDEX;
47  
48      private static final Map<Integer, Concept> OFFSET_INDEX;
49  
50      private static final Logger LOGGER = LoggerFactory.getLogger(YagoTaxonomy.class);
51  
52      static {
53          try {
54              final List<String> ids = Lists.newArrayList();
55              final Map<Integer, String> offsetMap = Maps.newHashMap();
56              final Multimap<Integer, Integer> parentsMap = HashMultimap.create();
57              final Multimap<Integer, Integer> childrenMap = HashMultimap.create();
58              for (final String line : Resources.readLines(
59                      YagoTaxonomy.class.getResource("YagoTaxonomy.tsv"), Charsets.UTF_8)) {
60                  final String[] tokens = line.split("\t");
61                  if (tokens.length > 0) {
62                      final int num = ids.size();
63                      final String id = tokens[0];
64                      ids.add(id);
65                      final int len = id.length();
66                      if (len > 9) {
67                          try {
68                              final int offset = Integer.parseInt(id.substring(len - 8));
69                              offsetMap.put(offset, id);
70                          } catch (final NumberFormatException ex) {
71                              // Ignore
72                          }
73                      }
74                      for (int i = 1; i < tokens.length; ++i) {
75                          final int parentNum = Integer.parseInt(tokens[i]);
76                          parentsMap.put(num, parentNum);
77                          childrenMap.put(parentNum, num);
78                      }
79                  }
80              }
81  
82              final String[] emptyIDs = new String[0];
83              final ImmutableMap.Builder<String, Concept> idIndexBuilder = ImmutableMap.builder();
84              for (int num = 0; num < ids.size(); ++num) {
85                  final String id = ids.get(num);
86                  final Collection<Integer> parentNums = parentsMap.get(num);
87                  final Collection<Integer> childrenNums = childrenMap.get(num);
88                  final int numParents = parentNums.size();
89                  final int numChildren = childrenNums.size();
90                  final String[] parentIDs = numParents == 0 ? emptyIDs : new String[numParents];
91                  final String[] childrenIDs = numChildren == 0 ? emptyIDs : new String[numChildren];
92                  int index = 0;
93                  for (final Integer parentNum : parentNums) {
94                      parentIDs[index++] = ids.get(parentNum);
95                  }
96                  index = 0;
97                  for (final Integer childrenNum : childrenNums) {
98                      childrenIDs[index++] = ids.get(childrenNum);
99                  }
100                 final Concept concept = new Concept(id, parentIDs, childrenIDs);
101                 idIndexBuilder.put(id, concept);
102             }
103             ID_INDEX = idIndexBuilder.build();
104 
105             final ImmutableMap.Builder<Integer, Concept> offsetIndexBuilder = ImmutableMap
106                     .builder();
107             for (final Map.Entry<Integer, String> entry : offsetMap.entrySet()) {
108                 offsetIndexBuilder.put(entry.getKey(), ID_INDEX.get(entry.getValue()));
109             }
110             OFFSET_INDEX = offsetIndexBuilder.build();
111 
112         } catch (final Exception ex) {
113             throw new Error(ex);
114         }
115     }
116 
117     @Nullable
118     public static IRI getDBpediaYagoIRI(@Nullable final String synsetID) {
119         if (synsetID != null) {
120             final Integer offset = Integer.valueOf(synsetID.substring(0, synsetID.length() - 2));
121             final Concept concept = OFFSET_INDEX.get(offset);
122             if (concept != null) {
123                 return SimpleValueFactory.getInstance() .createIRI(NAMESPACE + concept.id);
124             }
125         }
126         return null;
127     }
128 
129     public static Set<IRI> getDBpediaYagoIRIs(@Nullable final Iterable<String> synsetIDs) {
130         final Set<IRI> uris = Sets.newHashSet();
131         final Set<String> hypernyms = Sets.newHashSet();
132         final List<String> queue = Lists.newLinkedList();
133         if (synsetIDs != null) {
134             Iterables.addAll(queue, synsetIDs);
135         }
136         while (!queue.isEmpty()) {
137             final String synsetID = queue.remove(0);
138             final IRI uri = getDBpediaYagoIRI(synsetID);
139             if (uri != null) {
140                 uris.add(uri);
141             } else {
142                 for (final String hypernym : WordNet.getHypernyms(synsetID)) {
143                     if (hypernyms.add(hypernym)) {
144                         queue.add(hypernym);
145                     }
146                 }
147             }
148         }
149         return uris;
150     }
151 
152     @Nullable
153     public static String getSynsetID(@Nullable final IRI dbpediaYagoIRI) {
154         if (dbpediaYagoIRI != null && dbpediaYagoIRI.stringValue().startsWith(NAMESPACE)) {
155             final String s = dbpediaYagoIRI.stringValue();
156             final int l = s.length();
157             if (l > 9) {
158                 for (int i = l - 9; i < l; ++i) {
159                     if (!Character.isDigit(s.charAt(i))) {
160                         return null;
161                     }
162                 }
163                 return s.substring(l - 8) + "-n";
164             }
165         }
166         return null;
167     }
168 
169     public static Set<IRI> getSubClasses(final IRI parentIRI, final boolean recursive) {
170         final Set<IRI> result = Sets.newHashSet();
171         final List<IRI> queue = Lists.newLinkedList();
172         queue.add(parentIRI);
173         while (!queue.isEmpty()) {
174             final IRI uri = queue.remove(0);
175             final String id = uri.stringValue().substring(NAMESPACE.length());
176             final Concept concept = ID_INDEX.get(id);
177             if (concept != null) {
178                 for (final String childID : concept.children) {
179                     final IRI childIRI = SimpleValueFactory.getInstance().createIRI(
180                             NAMESPACE + childID);
181                     if (result.add(childIRI) && recursive) {
182                         queue.add(childIRI);
183                     }
184                 }
185             }
186         }
187         return result;
188     }
189 
190     public static Set<IRI> getSuperClasses(final IRI childIRI, final boolean recursive) {
191         final Set<IRI> result = Sets.newHashSet();
192         final List<IRI> queue = Lists.newLinkedList();
193         queue.add(childIRI);
194         while (!queue.isEmpty()) {
195             final IRI uri = queue.remove(0);
196             final String id = uri.stringValue().substring(NAMESPACE.length());
197             final Concept concept = ID_INDEX.get(id);
198             if (concept != null) {
199                 for (final String parentID : concept.parents) {
200                     final IRI parentIRI = SimpleValueFactory.getInstance().createIRI(
201                             NAMESPACE + parentID);
202                     if (result.add(parentIRI) && recursive) {
203                         queue.add(parentIRI);
204                     }
205                 }
206             }
207         }
208         return result;
209     }
210 
211     public static boolean isSubClassOf(final IRI childIRI, final IRI parentIRI) {
212         if (childIRI.equals(parentIRI)) {
213             return true;
214         }
215         final String childID = childIRI.stringValue().substring(NAMESPACE.length());
216         final Concept child = ID_INDEX.get(childID);
217         if (child == null) {
218             return false;
219         }
220         for (final String parentID : child.parents) {
221             final IRI uri = SimpleValueFactory.getInstance().createIRI(NAMESPACE + parentID);
222             if (isSubClassOf(uri, parentIRI)) {
223                 return true;
224             }
225         }
226         return false;
227     }
228 
229     public static void main(final String... args) {
230         try {
231             final CommandLine cmd = CommandLine
232                     .parser()
233                     .withName("eu.fbk.dkm.pikes.resources.YagoTaxonomy")
234                     .withHeader(
235                             "Generate a TSV file with mappings from offsets to DBpedia Yago IRIs")
236                     .withOption("i", "input", "the input RDF file with the DBpedia Yago taxonomy",
237                             "FILE", Type.FILE_EXISTING, true, false, true)
238                     .withOption("o", "output", "the output TSV file", "FILE", Type.FILE, true,
239                             false, true).withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
240 
241             final File input = cmd.getOptionValue("i", File.class);
242             final File output = cmd.getOptionValue("o", File.class);
243 
244             final Set<String> ids = Sets.newHashSet();
245             final Multimap<String, String> parents = HashMultimap.create();
246             final RDFSource source = RDFSources.read(false, true, null, null, null, true,
247                     input.getAbsolutePath());
248             source.emit(new AbstractRDFHandler() {
249 
250                 @Override
251                 public void handleStatement(final Statement stmt) throws RDFHandlerException {
252                     final Resource s = stmt.getSubject();
253                     final IRI p = stmt.getPredicate();
254                     final Value o = stmt.getObject();
255                     if (p.equals(RDFS.SUBCLASSOF) && s instanceof IRI && o instanceof IRI
256                             && s.stringValue().startsWith(NAMESPACE)
257                             && o.stringValue().startsWith(NAMESPACE)) {
258                         final String childID = s.stringValue().substring(NAMESPACE.length());
259                         final String parentID = o.stringValue().substring(NAMESPACE.length());
260                         if (getSynsetID((IRI) o) != null) {
261                             ids.add(parentID);
262                         }
263                         if (getSynsetID((IRI) s) != null) {
264                             ids.add(childID);
265                             parents.put(childID, parentID);
266                         }
267                     }
268                 }
269 
270             }, 1);
271 
272             final List<String> sortedIDs = Ordering.natural().immutableSortedCopy(ids);
273 
274             int counter = 0;
275             final Map<String, Integer> nums = Maps.newHashMap();
276             for (final String id : sortedIDs) {
277                 nums.put(id, counter++);
278             }
279 
280             try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(output.getAbsolutePath())))) {
281                 for (int childNum = 0; childNum < sortedIDs.size(); ++childNum) {
282                     final String childID = sortedIDs.get(childNum);
283                     writer.write(childID);
284                     for (final String parentID : parents.get(childID)) {
285                         final Integer parentNum = nums.get(parentID);
286                         if (parentNum != null) {
287                             writer.write("\t");
288                             writer.write(Integer.toString(parentNum));
289                         }
290                     }
291                     writer.write("\n");
292                 }
293             }
294 
295             LOGGER.info("Emitted {} mappings", sortedIDs.size());
296 
297         } catch (final Throwable ex) {
298             CommandLine.fail(ex);
299         }
300     }
301 
302     private static final class Concept {
303 
304         public final String id;
305 
306         public final String[] parents;
307 
308         public final String[] children;
309 
310         Concept(final String id, final String[] parents, final String[] children) {
311             this.id = id;
312             this.parents = parents;
313             this.children = children;
314         }
315 
316     }
317 
318 }