1   package eu.fbk.dkm.pikes.query;
2   
3   import java.io.File;
4   import java.io.Writer;
5   import java.util.List;
6   import java.util.Map;
7   import java.util.regex.Matcher;
8   import java.util.regex.Pattern;
9   
10  import com.google.common.collect.Lists;
11  import com.google.common.collect.Maps;
12  import com.google.common.collect.Ordering;
13  import com.google.common.xml.XmlEscapers;
14  
15  import org.eclipse.rdf4j.model.Literal;
16  import org.eclipse.rdf4j.model.Resource;
17  import org.eclipse.rdf4j.model.Statement;
18  import org.eclipse.rdf4j.model.IRI;
19  import org.eclipse.rdf4j.model.Value;
20  import org.eclipse.rdf4j.model.ValueFactory;
21  import org.eclipse.rdf4j.model.vocabulary.DC;
22  import org.eclipse.rdf4j.model.vocabulary.RDF;
23  import org.slf4j.Logger;
24  import org.slf4j.LoggerFactory;
25  
26  import eu.fbk.utils.core.CommandLine;
27  import eu.fbk.dkm.pikes.rdf.vocab.NIF;
28  import eu.fbk.rdfpro.RDFSources;
29  import eu.fbk.rdfpro.util.IO;
30  import eu.fbk.rdfpro.util.QuadModel;
31  import eu.fbk.rdfpro.util.Statements;
32  
33  public final class Yovisto {
34  
35      private static final Logger LOGGER = LoggerFactory.getLogger(Yovisto.class);
36  
37      private static final ValueFactory VF = Statements.VALUE_FACTORY;
38  
39      private static final IRI SI_QUERY = VF.createIRI("http://sindice.com/vocab/search#Query");
40  
41      private static final IRI SI_RESULT = VF.createIRI("http://sindice.com/vocab/search#result");
42  
43      private static final IRI SI_RANK = VF.createIRI("http://sindice.com/vocab/search#rank");
44  
45      private static final IRI YV_QUERY_ID = VF.createIRI("http://yovisto.com/eval#queryId");
46  
47      private static final IRI YV_DOCUMENT_ID = VF.createIRI("http://yovisto.com/eval#documentId");
48  
49      private static final IRI YV_DOCUMENT = VF.createIRI("http://yovisto.com/eval#Document");
50  
51      private static final IRI ITSRDF_TA_IDENT_REF = VF
52              .createIRI("http://www.w3.org/2005/11/its/rdf#taIdentRef");
53  
54      private static final Pattern SPLIT_PATTERN = Pattern
55              .compile("[^ ][ ]([ ]+[A-Z]|The |This |That |These |Those |My |Your |His |Her |Its "
56                      + "|Our |Their |Whose |A |An |Some |Any |Much |Many |Little |Few |More |Most "
57                      + "|Less |Fewer |Least |Fewest |Very |Too |So |Not |Lots of |Plenty of "
58                      + "|Half of |Twice |All |Both |Enough |No |Almost |Over |More than "
59                      + "|Less than |Each |Every |Either |Neither |You |He [a-zA-Z]|She |We |They "
60                      + "|Such |What |On |In |At |Since |For |After |Before |To |Until |By |Beside "
61                      + "|Under |Below |Over |Above |Across |Through |Into |Towards |Onto |From "
62                      + "|Off |Out of |About |But |And |Or |Although |As |Even |If |Now |Once "
63                      + "|Rather |Since |That |Though |Unless |When |Whenever |Where |Whereas "
64                      + "|Wherever |While |Whether |However |Moreover |Nevertheless |Consequently "
65                      + "|Already |Throughout |Further |Back |Also |Because |Finally )");
66  
67      private static final Pattern REMOVE_PATTERN = Pattern.compile("\\[[0-9]+(,[0-9]+)*\\]");
68  
69      public static void main(final String[] args) {
70          try {
71              // Parse command line
72              final CommandLine cmd = CommandLine
73                      .parser()
74                      .withName("yovisto")
75                      .withOption("i", "input", "the input RDF file with the Yovisto dataset",
76                              "PATH", CommandLine.Type.FILE_EXISTING, true, false, true)
77                      .withOption("o", "output", "output base name", "PATH",
78                              CommandLine.Type.STRING, true, false, true)
79                      .withHeader("parses the Yovisto file and emits NAF files for each document")
80                      .parse(args);
81  
82              // Extract options
83              final File input = cmd.getOptionValue("i", File.class);
84              final String output = cmd.getOptionValue("o", String.class);
85  
86              // Read RDF file
87              final QuadModel model = QuadModel.create();
88              for (final Statement stmt : RDFSources.read(false, true, null, null,null,true,
89                      input.getAbsolutePath())) {
90                  try {
91                      model.add(stmt);
92                  } catch (final Throwable ex) {
93                      LOGGER.error("Ignoring wrong statement: " + stmt);
94                  }
95              }
96  
97              // Define a IRI -> ID map
98              final Map<IRI, String> ids = Maps.newHashMap();
99  
100             // Emit queries
101             int numResults = 0;
102             final List<String> queryLines = Lists.newArrayList();
103             for (final Resource query : model.filter(null, RDF.TYPE, SI_QUERY).subjects()) {
104                 final String id = String.format("q%02d", model.filter(query, YV_QUERY_ID, null)
105                         .objectLiteral().intValue());
106                 ids.put((IRI) query, id);
107                 final String text = fixQuery(model.filter(query, NIF.IS_STRING, null)
108                         .objectLiteral().stringValue());
109                 final Map<Integer, String> resultMap = Maps.newHashMap();
110                 final Map<String, Integer> rankMap = Maps.newHashMap();
111                 for (final Value result : model.filter(query, SI_RESULT, null).objects()) {
112                     final IRI uri = (IRI) result;
113                     final int num = Integer.parseInt(uri.getLocalName());
114                     final int rank = model.filter(uri, SI_RANK, null).objectLiteral().intValue();
115                     final String documentId = String.format("d%03d",
116                             model.filter(uri, YV_DOCUMENT_ID, null).objectLiteral().intValue());
117                     resultMap.put(num, documentId);
118                     rankMap.put(documentId, rank);
119                 }
120                 final StringBuilder builder = new StringBuilder();
121                 builder.append(id).append('\t').append(text).append("\t");
122                 String separator = "";
123                 for (final Integer num : Ordering.natural().sortedCopy(resultMap.keySet())) {
124                     final String documentId = resultMap.get(num);
125                     final Integer rank = rankMap.get(documentId);
126                     builder.append(separator).append(documentId).append(':').append(rank);
127                     separator = ",";
128                     ++numResults;
129                 }
130                 queryLines.add(builder.toString());
131                 final int index = query.stringValue().indexOf('#');
132                 final IRI queryIRI = index < 0 ? (IRI) query : VF.createIRI(query.stringValue()
133                         .substring(0, index));
134                 try (Writer writer = IO
135                         .utf8Writer(IO.buffer(IO.write(output + "." + id + ".naf")))) {
136                     writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
137                     writer.write("<NAF xml:lang=\"en\" version=\"v3\">\n");
138                     writer.write("  <nafHeader>\n");
139                     writer.write("    <fileDesc creationtime=\"2015-07-09T00:00:00+00:00\" />\n");
140                     writer.write("    <public publicId=\""
141                             + XmlEscapers.xmlAttributeEscaper().escape(id) + "\" uri=\""
142                             + XmlEscapers.xmlAttributeEscaper().escape(queryIRI.stringValue())
143                             + "\"/>\n");
144                     writer.write("  </nafHeader>\n");
145                     writer.write("  <raw><![CDATA[");
146                     writer.write(text);
147                     writer.write("]]></raw>\n");
148                     writer.write("</NAF>\n");
149                 }
150             }
151             try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(output + ".queries")))) {
152                 for (final String line : Ordering.natural().sortedCopy(queryLines)) {
153                     writer.write(line);
154                     writer.write("\n");
155                 }
156             }
157             LOGGER.info("Emitted {} queries with {} results", queryLines.size(), numResults);
158 
159             // Emit NAF documents
160             for (final Resource document : model.filter(null, RDF.TYPE, YV_DOCUMENT).subjects()) {
161                 final String id = String.format("d%03d",
162                         model.filter(document, YV_DOCUMENT_ID, null).objectLiteral().intValue());
163                 final String title = model.filter(document, DC.TITLE, null).objectLiteral()
164                         .stringValue().trim();
165                 final String text = fixDocument(model.filter(document, NIF.IS_STRING, null)
166                         .objectLiteral().stringValue());
167                 final int index = document.stringValue().indexOf('#');
168                 final IRI documentIRI = index < 0 ? (IRI) document : VF.createIRI(document
169                         .stringValue().substring(0, index));
170                 ids.put((IRI) document, id);
171                 try (Writer writer = IO
172                         .utf8Writer(IO.buffer(IO.write(output + "." + id + ".naf")))) {
173                     writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
174                     writer.write("<NAF xml:lang=\"en\" version=\"v3\">\n");
175                     writer.write("  <nafHeader>\n");
176                     writer.write("    <fileDesc creationtime=\"2015-07-09T00:00:00+00:00\" title=\""
177                             + XmlEscapers.xmlAttributeEscaper().escape(title) + "\"/>\n");
178                     writer.write("    <public publicId=\""
179                             + XmlEscapers.xmlAttributeEscaper().escape(id) + "\" uri=\""
180                             + XmlEscapers.xmlAttributeEscaper().escape(documentIRI.stringValue())
181                             + "\"/>\n");
182                     writer.write("  </nafHeader>\n");
183                     writer.write("  <raw><![CDATA[");
184                     writer.write(title);
185                     if (!title.endsWith(".")) {
186                         writer.write(".");
187                     }
188                     writer.write("\n\n");
189                     writer.write(text);
190                     writer.write("]]></raw>\n");
191                     writer.write("</NAF>\n");
192                 }
193             }
194 
195             // Emit entities
196             final Map<String, String> entityLines = Maps.newHashMap();
197             for (final Statement stmt : model.filter(null, ITSRDF_TA_IDENT_REF, null)) {
198                 final IRI entity = (IRI) stmt.getSubject();
199                 final IRI reference = VF.createIRI(stmt.getObject().stringValue());
200                 final IRI context = VF.createIRI(model.filter(entity, NIF.REFERENCE_CONTEXT, null)
201                         .objectValue().stringValue());
202                 final String id = ids.get(context);
203                 final String text = model.filter(entity, NIF.ANCHOR_OF, null).objectLiteral()
204                         .stringValue();
205                 final int begin = getInt(model, entity, NIF.BEGIN_INDEX);
206                 final int end = getInt(model, entity, NIF.END_INDEX);
207                 entityLines.put(
208                         id + String.format("%04d", begin),
209                         String.format("%s\t%d\t%d\t%s\t%s", id, begin, end, text,
210                                 reference.stringValue()));
211             }
212             try (Writer writer = IO.utf8Writer(IO.buffer(IO.write(output + ".entities")))) {
213                 for (final String key : Ordering.natural().sortedCopy(entityLines.keySet())) {
214                     writer.append(entityLines.get(key));
215                     writer.append("\n");
216                 }
217             }
218 
219         } catch (final Throwable ex) {
220             // Display error information and terminate
221             CommandLine.fail(ex);
222         }
223     }
224 
225     private static String fixQuery(String string) {
226         if (string.equals("Famous German Poetry")) {
227             string = "famous German poetry";
228         } else if (string.equals("University of Edinburgh Research")) {
229             string = "University of Edinburgh research";
230         } else if (string.equals("Bridge Construction")) {
231             string = "bridge construction";
232         } else if (string.equals("Walk of fame stars")) {
233             string = "Walk of Fame stars";
234         } else if (string.equals("Invention of the internet")) {
235             string = "Invention of the Internet";
236         } else if (string.equals("Early Telecommunication Methods")) {
237             string = "early telecommunication methods";
238         } else if (string.equals("Famous Members of the Royal Navy")) {
239             string = "famous members of the Royal Navy";
240         } else if (string.equals("Nobel Prize Winning inventions")) {
241             string = "Nobel Prize winning inventions";
242         } else if (string.equals("Edward Teller &amp; Marie Curie")) {
243             string = "Edward Teller and Marie Curie";
244         } else if (string
245                 .equals("Computing Language for the programming of artificial intelligence")) {
246             string = "Computing Language for the programming of Artificial Intelligence";
247         } else if (string.equals("William Hearst Movie")) {
248             string = "William Hearst movie";
249         } else if (string.equals("Nazis confiscate / destroy art and literature")) {
250             string = "Nazis confiscate or destroy art and literature";
251         } else if (string.equals("Modern Physiology")) {
252             string = "modern Physiology";
253         } else if (string.equals("Aviation pioneers publications")) {
254             string = "Aviation pioneers' publications";
255         } else if (string.equals("Skinner's experiments with the Operant conditioning chamber")) {
256             string = "Skinner's experiments with the operant conditioning chamber";
257         } else if (string.equals("First woman who won a nobel prize")) {
258             string = "First woman who won a Nobel Prize";
259         }
260         return string;
261     }
262 
263     private static String fixDocument(final String string) {
264         final StringBuilder builder = new StringBuilder();
265         Matcher m = SPLIT_PATTERN.matcher(string);
266         int end = 0;
267         while (m.find()) {
268             builder.append(string.substring(end, m.start()));
269             end = m.end();
270             final char c = string.charAt(m.start());
271             if (c != '.' && c != ':') {
272                 if (LOGGER.isDebugEnabled()) {
273                     LOGGER.debug("Splitted '"
274                             + string.substring(Math.max(0, m.start() - 20), m.start() + 1)
275                             + " | "
276                             + string.substring(m.start() + 2,
277                                     Math.min(string.length(), m.end() + 20)));
278                 }
279                 builder.append(string.charAt(m.start()));
280                 builder.append(". ");
281                 builder.append(m.group().substring(1));
282                 // builder.setCharAt(m.start() + 1, '.');
283             } else {
284                 builder.append(m.group());
285             }
286         }
287         builder.append(string.substring(end));
288         m = REMOVE_PATTERN.matcher(builder);
289         while (m.find()) {
290             for (int i = m.start(); i < m.end(); ++i) {
291                 builder.setCharAt(i, ' ');
292             }
293         }
294         return builder.toString();
295     }
296 
297     private static int getInt(final QuadModel model, final Resource subject, final IRI property) {
298         for (final Value value : model.filter(subject, property, null).objects()) {
299             try {
300                 return ((Literal) value).intValue();
301             } catch (final Throwable ex) {
302                 LOGGER.error("Not an integer: " + value);
303             }
304         }
305         throw new IllegalArgumentException("Missing " + property + " for " + subject);
306     }
307 
308     private Yovisto() {
309     }
310 
311 }