package eu.fbk.dkm.pikes;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.Files;
import eu.fbk.dkm.pikes.rdf.naf.NAFExtractor;
import eu.fbk.dkm.pikes.rdf.vocab.*;
import eu.fbk.rdfpro.RDFSources;
import eu.fbk.rdfpro.util.IO;
import eu.fbk.rdfpro.util.Statements;
import eu.fbk.utils.core.CommandLine;
import ixa.kaflib.KAFDocument;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Namespace;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.impl.LinkedHashModel;
import org.eclipse.rdf4j.model.impl.SimpleNamespace;
import org.eclipse.rdf4j.model.vocabulary.*;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFWriter;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Reader;
import java.util.Collections;
import java.util.List;
import java.util.Set;

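/**
 * Test/utility class that converts NAF documents into RDF (TriG) files using {@link NAFExtractor}.
 * <p>
 * The input folder is scanned recursively for {@code .naf} / {@code .naf.gz} files; each document
 * is turned into an RDF model and written to a mirrored {@code .trig} file in the output folder.
 * The default paths below are developer-local and are meant to be overridden with the
 * {@code -i} / {@code -o} options. Example invocation (paths are placeholders, classpath omitted):
 * <pre>
 * java eu.fbk.dkm.pikes.NewRDFGeneratorTest -i /path/to/input-naf -o /path/to/output-rdf
 * </pre>
 */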
public class NewRDFGeneratorTest {

    final static String DEFAULT_PATH_INPUT = "/Users/marcorospocher/Downloads/pikes-kem-ud/input-naf";
    final static String DEFAULT_PATH_OUTPUT = "/Users/marcorospocher/Downloads/pikes-kem-ud/output-rdf";

    public static void main(final String... args) {

        final CommandLine cmd = CommandLine
                .parser()
                .withName("generate-rdf")
                .withHeader("Generate RDF (TriG) files from NAF documents")
                .withOption("i", "input",
                        String.format("input folder (default %s)", DEFAULT_PATH_INPUT), "FOLDER",
                        CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
                .withOption("o", "output",
                        String.format("output folder (default %s)", DEFAULT_PATH_OUTPUT), "FOLDER",
                        CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
                .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);

        // Input/output folders (command line options override the defaults)
        File inputFolder = new File(DEFAULT_PATH_INPUT);
        if (cmd.hasOption("input")) {
            inputFolder = cmd.getOptionValue("input", File.class);
        }

        File outputFolder = new File(DEFAULT_PATH_OUTPUT);
        if (cmd.hasOption("output")) {
            outputFolder = cmd.getOptionValue("output", File.class);
        }

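        // Recursively walk the input folder and convert every .naf / .naf.gz file found,
        // mirroring the folder structure under the output folder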
        for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
            if (!file.isFile()) {
                continue;
            }
            if (file.getName().startsWith(".")) {
                continue;
            }
            if (!file.getName().endsWith(".naf.gz") && !file.getName().endsWith(".naf")) {
                continue;
            }

            // Mirror the input path under the output folder, adding a .trig extension
            String outputFileName = file.getAbsoluteFile().toString()
                    .replace(inputFolder.getAbsolutePath(), outputFolder.getAbsolutePath()) + ".trig";
            File outputFile = new File(outputFileName);

            // Skip files that have already been converted
            if (!outputFile.exists()) {

                try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(file.getAbsoluteFile().toString())))) {

                    // Read the NAF document
                    final KAFDocument document = KAFDocument.createFromStream(reader);

                    // Extract RDF statements from the NAF annotations into an in-memory model
                    final Model model = new LinkedHashModel();
                    final NAFExtractor extractor = NAFExtractor.builder().build();
                    extractor.generate(document, model, null);

                    // Serialize the model to the output file, closing the stream when done
                    try (OutputStream outputStream = IO.buffer(IO.write(outputFile.getAbsolutePath()))) {
                        writeGraph(model, outputStream, outputFileName);
                    }

                } catch (final Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
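    /**
     * Serializes {@code graph} to {@code stream}, choosing the RDF format from {@code fileName}
     * and registering the namespaces commonly used in PIKES output so that prefixed names are
     * emitted.
     */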
    private static void writeGraph(final Model graph, final OutputStream stream,
            final String fileName) throws IOException {

        // Determine the RDF serialization format (e.g. TriG) from the output file name
        final RDFFormat rdfFormat = Rio.getWriterFormatForFileName(fileName)
                .orElseThrow(() -> new IOException("Unsupported RDF format for " + fileName));

        try {
            final RDFWriter writer = Rio.createWriter(rdfFormat, stream);

            // Sort statements (subject, predicate, object, context) for a stable, readable output
            final List<Statement> stmts = Lists.newArrayList(graph);
            Collections.sort(stmts, Statements.statementComparator("spoc",
                    Statements.valueComparator(RDF.NAMESPACE)));
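
            // Register namespaces so the writer emits compact prefixed names in the output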
            final Set<Namespace> namespaces = Sets.newLinkedHashSet(graph.getNamespaces());
            namespaces.add(KS.NS);
            namespaces.add(NIF.NS);
            namespaces.add(DCTERMS.NS);
            namespaces.add(OWLTIME.NS);
            namespaces.add(XMLSchema.NS);
            namespaces.add(OWL.NS); // not strictly necessary
            namespaces.add(RDF.NS); // not strictly necessary
            namespaces.add(RDFS.NS);
            namespaces.add(KEM.NS);
            namespaces.add(KEMT.NS);
            namespaces.add(ITSRDF.NS);
            namespaces.add(new SimpleNamespace("dbpedia", "http://dbpedia.org/resource/"));
            namespaces.add(new SimpleNamespace("wn30", "http://wordnet-rdf.princeton.edu/wn30/"));
            namespaces.add(new SimpleNamespace("sst", "http://pikes.fbk.eu/wn/sst/"));
            namespaces.add(new SimpleNamespace("bbn", "http://pikes.fbk.eu/bbn/"));
            namespaces.add(new SimpleNamespace("pm", "http://premon.fbk.eu/resource/"));
            namespaces.add(new SimpleNamespace("ili", "http://sli.uvigo.gal/rdf_galnet/"));
            namespaces.add(new SimpleNamespace("ner", "http://pikes.fbk.eu/ner/"));
            namespaces.add(new SimpleNamespace("olia-penn-pos", "http://purl.org/olia/penn.owl#"));
            namespaces.add(new SimpleNamespace("olia-ud-pos", "http://fginter.github.io/docs/u/pos/all.html#"));

            // TODO: register the still-missing http://lexvo.org/id/iso639-3/ namespace
            RDFSources.wrap(stmts, namespaces).emit(writer, 1);
        } catch (final Throwable ex) {
            throw new IOException(ex);
        }
    }

}