1 package eu.fbk.dkm.pikes.resources;
2
3 import java.io.File;
4 import java.io.Writer;
5 import java.util.Collection;
6 import java.util.List;
7 import java.util.Map;
8 import java.util.Set;
9
10 import javax.annotation.Nullable;
11
12 import com.google.common.base.Charsets;
13 import com.google.common.collect.HashMultimap;
14 import com.google.common.collect.ImmutableMap;
15 import com.google.common.collect.Iterables;
16 import com.google.common.collect.Lists;
17 import com.google.common.collect.Maps;
18 import com.google.common.collect.Multimap;
19 import com.google.common.collect.Ordering;
20 import com.google.common.collect.Sets;
21 import com.google.common.io.Resources;
22
23 import org.eclipse.rdf4j.model.Resource;
24 import org.eclipse.rdf4j.model.Statement;
25 import org.eclipse.rdf4j.model.IRI;
26 import org.eclipse.rdf4j.model.Value;
27 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
28 import org.eclipse.rdf4j.model.impl.ValueFactoryImpl;
29 import org.eclipse.rdf4j.model.vocabulary.RDFS;
30 import org.eclipse.rdf4j.rio.RDFHandlerException;
31 import org.slf4j.Logger;
32 import org.slf4j.LoggerFactory;
33
34 import eu.fbk.utils.core.CommandLine;
35 import eu.fbk.utils.core.CommandLine.Type;
36 import eu.fbk.rdfpro.AbstractRDFHandler;
37 import eu.fbk.rdfpro.RDFSource;
38 import eu.fbk.rdfpro.RDFSources;
39 import eu.fbk.rdfpro.tql.TQL;
40 import eu.fbk.rdfpro.util.IO;
41
42 public final class YagoTaxonomy {
43
44 public static final String NAMESPACE = "http://dbpedia.org/class/yago/";
45
46 private static final Map<String, Concept> ID_INDEX;
47
48 private static final Map<Integer, Concept> OFFSET_INDEX;
49
50 private static final Logger LOGGER = LoggerFactory.getLogger(YagoTaxonomy.class);
51
52 static {
53 try {
54 final List<String> ids = Lists.newArrayList();
55 final Map<Integer, String> offsetMap = Maps.newHashMap();
56 final Multimap<Integer, Integer> parentsMap = HashMultimap.create();
57 final Multimap<Integer, Integer> childrenMap = HashMultimap.create();
58 for (final String line : Resources.readLines(
59 YagoTaxonomy.class.getResource("YagoTaxonomy.tsv"), Charsets.UTF_8)) {
60 final String[] tokens = line.split("\t");
61 if (tokens.length > 0) {
62 final int num = ids.size();
63 final String id = tokens[0];
64 ids.add(id);
65 final int len = id.length();
66 if (len > 9) {
67 try {
68 final int offset = Integer.parseInt(id.substring(len - 8));
69 offsetMap.put(offset, id);
70 } catch (final NumberFormatException ex) {
71
72 }
73 }
74 for (int i = 1; i < tokens.length; ++i) {
75 final int parentNum = Integer.parseInt(tokens[i]);
76 parentsMap.put(num, parentNum);
77 childrenMap.put(parentNum, num);
78 }
79 }
80 }
81
82 final String[] emptyIDs = new String[0];
83 final ImmutableMap.Builder<String, Concept> idIndexBuilder = ImmutableMap.builder();
84 for (int num = 0; num < ids.size(); ++num) {
85 final String id = ids.get(num);
86 final Collection<Integer> parentNums = parentsMap.get(num);
87 final Collection<Integer> childrenNums = childrenMap.get(num);
88 final int numParents = parentNums.size();
89 final int numChildren = childrenNums.size();
90 final String[] parentIDs = numParents == 0 ? emptyIDs : new String[numParents];
91 final String[] childrenIDs = numChildren == 0 ? emptyIDs : new String[numChildren];
92 int index = 0;
93 for (final Integer parentNum : parentNums) {
94 parentIDs[index++] = ids.get(parentNum);
95 }
96 index = 0;
97 for (final Integer childrenNum : childrenNums) {
98 childrenIDs[index++] = ids.get(childrenNum);
99 }
100 final Concept concept = new Concept(id, parentIDs, childrenIDs);
101 idIndexBuilder.put(id, concept);
102 }
103 ID_INDEX = idIndexBuilder.build();
104
105 final ImmutableMap.Builder<Integer, Concept> offsetIndexBuilder = ImmutableMap
106 .builder();
107 for (final Map.Entry<Integer, String> entry : offsetMap.entrySet()) {
108 offsetIndexBuilder.put(entry.getKey(), ID_INDEX.get(entry.getValue()));
109 }
110 OFFSET_INDEX = offsetIndexBuilder.build();
111
112 } catch (final Exception ex) {
113 throw new Error(ex);
114 }
115 }
116
117 @Nullable
118 public static IRI getDBpediaYagoIRI(@Nullable final String synsetID) {
119 if (synsetID != null) {
120 final Integer offset = Integer.valueOf(synsetID.substring(0, synsetID.length() - 2));
121 final Concept concept = OFFSET_INDEX.get(offset);
122 if (concept != null) {
123 return SimpleValueFactory.getInstance() .createIRI(NAMESPACE + concept.id);
124 }
125 }
126 return null;
127 }
128
129 public static Set<IRI> getDBpediaYagoIRIs(@Nullable final Iterable<String> synsetIDs) {
130 final Set<IRI> uris = Sets.newHashSet();
131 final Set<String> hypernyms = Sets.newHashSet();
132 final List<String> queue = Lists.newLinkedList();
133 if (synsetIDs != null) {
134 Iterables.addAll(queue, synsetIDs);
135 }
136 while (!queue.isEmpty()) {
137 final String synsetID = queue.remove(0);
138 final IRI uri = getDBpediaYagoIRI(synsetID);
139 if (uri != null) {
140 uris.add(uri);
141 } else {
142 for (final String hypernym : WordNet.getHypernyms(synsetID)) {
143 if (hypernyms.add(hypernym)) {
144 queue.add(hypernym);
145 }
146 }
147 }
148 }
149 return uris;
150 }
151
152 @Nullable
153 public static String getSynsetID(@Nullable final IRI dbpediaYagoIRI) {
154 if (dbpediaYagoIRI != null && dbpediaYagoIRI.stringValue().startsWith(NAMESPACE)) {
155 final String s = dbpediaYagoIRI.stringValue();
156 final int l = s.length();
157 if (l > 9) {
158 for (int i = l - 9; i < l; ++i) {
159 if (!Character.isDigit(s.charAt(i))) {
160 return null;
161 }
162 }
163 return s.substring(l - 8) + "-n";
164 }
165 }
166 return null;
167 }
168
169 public static Set<IRI> getSubClasses(final IRI parentIRI, final boolean recursive) {
170 final Set<IRI> result = Sets.newHashSet();
171 final List<IRI> queue = Lists.newLinkedList();
172 queue.add(parentIRI);
173 while (!queue.isEmpty()) {
174 final IRI uri = queue.remove(0);
175 final String id = uri.stringValue().substring(NAMESPACE.length());
176 final Concept concept = ID_INDEX.get(id);
177 if (concept != null) {
178 for (final String childID : concept.children) {
179 final IRI childIRI = SimpleValueFactory.getInstance().createIRI(
180 NAMESPACE + childID);
181 if (result.add(childIRI) && recursive) {
182 queue.add(childIRI);
183 }
184 }
185 }
186 }
187 return result;
188 }
189
190 public static Set<IRI> getSuperClasses(final IRI childIRI, final boolean recursive) {
191 final Set<IRI> result = Sets.newHashSet();
192 final List<IRI> queue = Lists.newLinkedList();
193 queue.add(childIRI);
194 while (!queue.isEmpty()) {
195 final IRI uri = queue.remove(0);
196 final String id = uri.stringValue().substring(NAMESPACE.length());
197 final Concept concept = ID_INDEX.get(id);
198 if (concept != null) {
199 for (final String parentID : concept.parents) {
200 final IRI parentIRI = SimpleValueFactory.getInstance().createIRI(
201 NAMESPACE + parentID);
202 if (result.add(parentIRI) && recursive) {
203 queue.add(parentIRI);
204 }
205 }
206 }
207 }
208 return result;
209 }
210
211 public static boolean isSubClassOf(final IRI childIRI, final IRI parentIRI) {
212 if (childIRI.equals(parentIRI)) {
213 return true;
214 }
215 final String childID = childIRI.stringValue().substring(NAMESPACE.length());
216 final Concept child = ID_INDEX.get(childID);
217 if (child == null) {
218 return false;
219 }
220 for (final String parentID : child.parents) {
221 final IRI uri = SimpleValueFactory.getInstance().createIRI(NAMESPACE + parentID);
222 if (isSubClassOf(uri, parentIRI)) {
223 return true;
224 }
225 }
226 return false;
227 }
228
229 public static void main(final String... args) {
230 try {
231 final CommandLine cmd = CommandLine
232 .parser()
233 .withName("eu.fbk.dkm.pikes.resources.YagoTaxonomy")
234 .withHeader(
235 "Generate a TSV file with mappings from offsets to DBpedia Yago IRIs")
236 .withOption("i", "input", "the input RDF file with the DBpedia Yago taxonomy",
237 "FILE", Type.FILE_EXISTING, true, false, true)
238 .withOption("o", "output", "the output TSV file", "FILE", Type.FILE, true,
239 false, true).withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
240
241 final File input = cmd.getOptionValue("i", File.class);
242 final File output = cmd.getOptionValue("o", File.class);
243
244 final Set<String> ids = Sets.newHashSet();
245 final Multimap<String, String> parents = HashMultimap.create();
246 final RDFSource source = RDFSources.read(false, true, null, null, null, true,
247 input.getAbsolutePath());
248 source.emit(new AbstractRDFHandler() {
249
250 @Override
251 public void handleStatement(final Statement stmt) throws RDFHandlerException {
252 final Resource s = stmt.getSubject();
253 final IRI p = stmt.getPredicate();
254 final Value o = stmt.getObject();
255 if (p.equals(RDFS.SUBCLASSOF) && s instanceof IRI && o instanceof IRI
256 && s.stringValue().startsWith(NAMESPACE)
257 && o.stringValue().startsWith(NAMESPACE)) {
258 final String childID = s.stringValue().substring(NAMESPACE.length());
259 final String parentID = o.stringValue().substring(NAMESPACE.length());
260 if (getSynsetID((IRI) o) != null) {
261 ids.add(parentID);
262 }
263 if (getSynsetID((IRI) s) != null) {
264 ids.add(childID);
265 parents.put(childID, parentID);
266 }
267 }
268 }
269
270 }, 1);
271
272 final List<String> sortedIDs = Ordering.natural().immutableSortedCopy(ids);
273
274 int counter = 0;
275 final Map<String, Integer> nums = Maps.newHashMap();
276 for (final String id : sortedIDs) {
277 nums.put(id, counter++);
278 }
279
280 try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(output.getAbsolutePath())))) {
281 for (int childNum = 0; childNum < sortedIDs.size(); ++childNum) {
282 final String childID = sortedIDs.get(childNum);
283 writer.write(childID);
284 for (final String parentID : parents.get(childID)) {
285 final Integer parentNum = nums.get(parentID);
286 if (parentNum != null) {
287 writer.write("\t");
288 writer.write(Integer.toString(parentNum));
289 }
290 }
291 writer.write("\n");
292 }
293 }
294
295 LOGGER.info("Emitted {} mappings", sortedIDs.size());
296
297 } catch (final Throwable ex) {
298 CommandLine.fail(ex);
299 }
300 }
301
302 private static final class Concept {
303
304 public final String id;
305
306 public final String[] parents;
307
308 public final String[] children;
309
310 Concept(final String id, final String[] parents, final String[] children) {
311 this.id = id;
312 this.parents = parents;
313 this.children = children;
314 }
315
316 }
317
318 }