1 package eu.fbk.dkm.pikes.query;
2
3 import java.io.File;
4 import java.io.Writer;
5 import java.util.List;
6 import java.util.Map;
7 import java.util.regex.Matcher;
8 import java.util.regex.Pattern;
9
10 import com.google.common.collect.Lists;
11 import com.google.common.collect.Maps;
12 import com.google.common.collect.Ordering;
13 import com.google.common.xml.XmlEscapers;
14
15 import org.eclipse.rdf4j.model.Literal;
16 import org.eclipse.rdf4j.model.Resource;
17 import org.eclipse.rdf4j.model.Statement;
18 import org.eclipse.rdf4j.model.IRI;
19 import org.eclipse.rdf4j.model.Value;
20 import org.eclipse.rdf4j.model.ValueFactory;
21 import org.eclipse.rdf4j.model.vocabulary.DC;
22 import org.eclipse.rdf4j.model.vocabulary.RDF;
23 import org.slf4j.Logger;
24 import org.slf4j.LoggerFactory;
25
26 import eu.fbk.utils.core.CommandLine;
27 import eu.fbk.dkm.pikes.rdf.vocab.NIF;
28 import eu.fbk.rdfpro.RDFSources;
29 import eu.fbk.rdfpro.util.IO;
30 import eu.fbk.rdfpro.util.QuadModel;
31 import eu.fbk.rdfpro.util.Statements;
32
33 public final class Yovisto {
34
35 private static final Logger LOGGER = LoggerFactory.getLogger(Yovisto.class);
36
37 private static final ValueFactory VF = Statements.VALUE_FACTORY;
38
39 private static final IRI SI_QUERY = VF.createIRI("http://sindice.com/vocab/search#Query");
40
41 private static final IRI SI_RESULT = VF.createIRI("http://sindice.com/vocab/search#result");
42
43 private static final IRI SI_RANK = VF.createIRI("http://sindice.com/vocab/search#rank");
44
45 private static final IRI YV_QUERY_ID = VF.createIRI("http://yovisto.com/eval#queryId");
46
47 private static final IRI YV_DOCUMENT_ID = VF.createIRI("http://yovisto.com/eval#documentId");
48
49 private static final IRI YV_DOCUMENT = VF.createIRI("http://yovisto.com/eval#Document");
50
51 private static final IRI ITSRDF_TA_IDENT_REF = VF
52 .createIRI("http://www.w3.org/2005/11/its/rdf#taIdentRef");
53
54 private static final Pattern SPLIT_PATTERN = Pattern
55 .compile("[^ ][ ]([ ]+[A-Z]|The |This |That |These |Those |My |Your |His |Her |Its "
56 + "|Our |Their |Whose |A |An |Some |Any |Much |Many |Little |Few |More |Most "
57 + "|Less |Fewer |Least |Fewest |Very |Too |So |Not |Lots of |Plenty of "
58 + "|Half of |Twice |All |Both |Enough |No |Almost |Over |More than "
59 + "|Less than |Each |Every |Either |Neither |You |He [a-zA-Z]|She |We |They "
60 + "|Such |What |On |In |At |Since |For |After |Before |To |Until |By |Beside "
61 + "|Under |Below |Over |Above |Across |Through |Into |Towards |Onto |From "
62 + "|Off |Out of |About |But |And |Or |Although |As |Even |If |Now |Once "
63 + "|Rather |Since |That |Though |Unless |When |Whenever |Where |Whereas "
64 + "|Wherever |While |Whether |However |Moreover |Nevertheless |Consequently "
65 + "|Already |Throughout |Further |Back |Also |Because |Finally )");
66
67 private static final Pattern REMOVE_PATTERN = Pattern.compile("\\[[0-9]+(,[0-9]+)*\\]");
68
69 public static void main(final String[] args) {
70 try {
71
72 final CommandLine cmd = CommandLine
73 .parser()
74 .withName("yovisto")
75 .withOption("i", "input", "the input RDF file with the Yovisto dataset",
76 "PATH", CommandLine.Type.FILE_EXISTING, true, false, true)
77 .withOption("o", "output", "output base name", "PATH",
78 CommandLine.Type.STRING, true, false, true)
79 .withHeader("parses the Yovisto file and emits NAF files for each document")
80 .parse(args);
81
82
83 final File input = cmd.getOptionValue("i", File.class);
84 final String output = cmd.getOptionValue("o", String.class);
85
86
87 final QuadModel model = QuadModel.create();
88 for (final Statement stmt : RDFSources.read(false, true, null, null,null,true,
89 input.getAbsolutePath())) {
90 try {
91 model.add(stmt);
92 } catch (final Throwable ex) {
93 LOGGER.error("Ignoring wrong statement: " + stmt);
94 }
95 }
96
97
98 final Map<IRI, String> ids = Maps.newHashMap();
99
100
101 int numResults = 0;
102 final List<String> queryLines = Lists.newArrayList();
103 for (final Resource query : model.filter(null, RDF.TYPE, SI_QUERY).subjects()) {
104 final String id = String.format("q%02d", model.filter(query, YV_QUERY_ID, null)
105 .objectLiteral().intValue());
106 ids.put((IRI) query, id);
107 final String text = fixQuery(model.filter(query, NIF.IS_STRING, null)
108 .objectLiteral().stringValue());
109 final Map<Integer, String> resultMap = Maps.newHashMap();
110 final Map<String, Integer> rankMap = Maps.newHashMap();
111 for (final Value result : model.filter(query, SI_RESULT, null).objects()) {
112 final IRI uri = (IRI) result;
113 final int num = Integer.parseInt(uri.getLocalName());
114 final int rank = model.filter(uri, SI_RANK, null).objectLiteral().intValue();
115 final String documentId = String.format("d%03d",
116 model.filter(uri, YV_DOCUMENT_ID, null).objectLiteral().intValue());
117 resultMap.put(num, documentId);
118 rankMap.put(documentId, rank);
119 }
120 final StringBuilder builder = new StringBuilder();
121 builder.append(id).append('\t').append(text).append("\t");
122 String separator = "";
123 for (final Integer num : Ordering.natural().sortedCopy(resultMap.keySet())) {
124 final String documentId = resultMap.get(num);
125 final Integer rank = rankMap.get(documentId);
126 builder.append(separator).append(documentId).append(':').append(rank);
127 separator = ",";
128 ++numResults;
129 }
130 queryLines.add(builder.toString());
131 final int index = query.stringValue().indexOf('#');
132 final IRI queryIRI = index < 0 ? (IRI) query : VF.createIRI(query.stringValue()
133 .substring(0, index));
134 try (Writer writer = IO
135 .utf8Writer(IO.buffer(IO.write(output + "." + id + ".naf")))) {
136 writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
137 writer.write("<NAF xml:lang=\"en\" version=\"v3\">\n");
138 writer.write(" <nafHeader>\n");
139 writer.write(" <fileDesc creationtime=\"2015-07-09T00:00:00+00:00\" />\n");
140 writer.write(" <public publicId=\""
141 + XmlEscapers.xmlAttributeEscaper().escape(id) + "\" uri=\""
142 + XmlEscapers.xmlAttributeEscaper().escape(queryIRI.stringValue())
143 + "\"/>\n");
144 writer.write(" </nafHeader>\n");
145 writer.write(" <raw><![CDATA[");
146 writer.write(text);
147 writer.write("]]></raw>\n");
148 writer.write("</NAF>\n");
149 }
150 }
151 try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(output + ".queries")))) {
152 for (final String line : Ordering.natural().sortedCopy(queryLines)) {
153 writer.write(line);
154 writer.write("\n");
155 }
156 }
157 LOGGER.info("Emitted {} queries with {} results", queryLines.size(), numResults);
158
159
160 for (final Resource document : model.filter(null, RDF.TYPE, YV_DOCUMENT).subjects()) {
161 final String id = String.format("d%03d",
162 model.filter(document, YV_DOCUMENT_ID, null).objectLiteral().intValue());
163 final String title = model.filter(document, DC.TITLE, null).objectLiteral()
164 .stringValue().trim();
165 final String text = fixDocument(model.filter(document, NIF.IS_STRING, null)
166 .objectLiteral().stringValue());
167 final int index = document.stringValue().indexOf('#');
168 final IRI documentIRI = index < 0 ? (IRI) document : VF.createIRI(document
169 .stringValue().substring(0, index));
170 ids.put((IRI) document, id);
171 try (Writer writer = IO
172 .utf8Writer(IO.buffer(IO.write(output + "." + id + ".naf")))) {
173 writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
174 writer.write("<NAF xml:lang=\"en\" version=\"v3\">\n");
175 writer.write(" <nafHeader>\n");
176 writer.write(" <fileDesc creationtime=\"2015-07-09T00:00:00+00:00\" title=\""
177 + XmlEscapers.xmlAttributeEscaper().escape(title) + "\"/>\n");
178 writer.write(" <public publicId=\""
179 + XmlEscapers.xmlAttributeEscaper().escape(id) + "\" uri=\""
180 + XmlEscapers.xmlAttributeEscaper().escape(documentIRI.stringValue())
181 + "\"/>\n");
182 writer.write(" </nafHeader>\n");
183 writer.write(" <raw><![CDATA[");
184 writer.write(title);
185 if (!title.endsWith(".")) {
186 writer.write(".");
187 }
188 writer.write("\n\n");
189 writer.write(text);
190 writer.write("]]></raw>\n");
191 writer.write("</NAF>\n");
192 }
193 }
194
195
196 final Map<String, String> entityLines = Maps.newHashMap();
197 for (final Statement stmt : model.filter(null, ITSRDF_TA_IDENT_REF, null)) {
198 final IRI entity = (IRI) stmt.getSubject();
199 final IRI reference = VF.createIRI(stmt.getObject().stringValue());
200 final IRI context = VF.createIRI(model.filter(entity, NIF.REFERENCE_CONTEXT, null)
201 .objectValue().stringValue());
202 final String id = ids.get(context);
203 final String text = model.filter(entity, NIF.ANCHOR_OF, null).objectLiteral()
204 .stringValue();
205 final int begin = getInt(model, entity, NIF.BEGIN_INDEX);
206 final int end = getInt(model, entity, NIF.END_INDEX);
207 entityLines.put(
208 id + String.format("%04d", begin),
209 String.format("%s\t%d\t%d\t%s\t%s", id, begin, end, text,
210 reference.stringValue()));
211 }
212 try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(output + ".entities")))) {
213 for (final String key : Ordering.natural().sortedCopy(entityLines.keySet())) {
214 writer.append(entityLines.get(key));
215 writer.append("\n");
216 }
217 }
218
219 } catch (final Throwable ex) {
220
221 CommandLine.fail(ex);
222 }
223 }
224
225 private static String fixQuery(String string) {
226 if (string.equals("Famous German Poetry")) {
227 string = "famous German poetry";
228 } else if (string.equals("University of Edinburgh Research")) {
229 string = "University of Edinburgh research";
230 } else if (string.equals("Bridge Construction")) {
231 string = "bridge construction";
232 } else if (string.equals("Walk of fame stars")) {
233 string = "Walk of Fame stars";
234 } else if (string.equals("Invention of the internet")) {
235 string = "Invention of the Internet";
236 } else if (string.equals("Early Telecommunication Methods")) {
237 string = "early telecommunication methods";
238 } else if (string.equals("Famous Members of the Royal Navy")) {
239 string = "famous members of the Royal Navy";
240 } else if (string.equals("Nobel Prize Winning inventions")) {
241 string = "Nobel Prize winning inventions";
242 } else if (string.equals("Edward Teller & Marie Curie")) {
243 string = "Edward Teller and Marie Curie";
244 } else if (string
245 .equals("Computing Language for the programming of artificial intelligence")) {
246 string = "Computing Language for the programming of Artificial Intelligence";
247 } else if (string.equals("William Hearst Movie")) {
248 string = "William Hearst movie";
249 } else if (string.equals("Nazis confiscate / destroy art and literature")) {
250 string = "Nazis confiscate or destroy art and literature";
251 } else if (string.equals("Modern Physiology")) {
252 string = "modern Physiology";
253 } else if (string.equals("Aviation pioneers publications")) {
254 string = "Aviation pioneers' publications";
255 } else if (string.equals("Skinner's experiments with the Operant conditioning chamber")) {
256 string = "Skinner's experiments with the operant conditioning chamber";
257 } else if (string.equals("First woman who won a nobel prize")) {
258 string = "First woman who won a Nobel Prize";
259 }
260 return string;
261 }
262
263 private static String fixDocument(final String string) {
264 final StringBuilder builder = new StringBuilder();
265 Matcher m = SPLIT_PATTERN.matcher(string);
266 int end = 0;
267 while (m.find()) {
268 builder.append(string.substring(end, m.start()));
269 end = m.end();
270 final char c = string.charAt(m.start());
271 if (c != '.' && c != ':') {
272 if (LOGGER.isDebugEnabled()) {
273 LOGGER.debug("Splitted '"
274 + string.substring(Math.max(0, m.start() - 20), m.start() + 1)
275 + " | "
276 + string.substring(m.start() + 2,
277 Math.min(string.length(), m.end() + 20)));
278 }
279 builder.append(string.charAt(m.start()));
280 builder.append(". ");
281 builder.append(m.group().substring(1));
282
283 } else {
284 builder.append(m.group());
285 }
286 }
287 builder.append(string.substring(end));
288 m = REMOVE_PATTERN.matcher(builder);
289 while (m.find()) {
290 for (int i = m.start(); i < m.end(); ++i) {
291 builder.setCharAt(i, ' ');
292 }
293 }
294 return builder.toString();
295 }
296
297 private static int getInt(final QuadModel model, final Resource subject, final IRI property) {
298 for (final Value value : model.filter(subject, property, null).objects()) {
299 try {
300 return ((Literal) value).intValue();
301 } catch (final Throwable ex) {
302 LOGGER.error("Not an integer: " + value);
303 }
304 }
305 throw new IllegalArgumentException("Missing " + property + " for " + subject);
306 }
307
/** Private constructor: the class only declares static members and is never instantiated. */
private Yovisto() {
    // nothing to initialize
}
310
311 }