1 package eu.fbk.dkm.pikes;
2
3 import com.google.common.collect.Lists;
4 import com.google.common.collect.Sets;
5 import com.google.common.io.Files;
6 import eu.fbk.dkm.pikes.rdf.naf.NAFExtractor;
7 import eu.fbk.dkm.pikes.rdf.vocab.*;
8 import eu.fbk.rdfpro.RDFSources;
9 import eu.fbk.rdfpro.util.IO;
10 import eu.fbk.rdfpro.util.Statements;
11 import eu.fbk.utils.core.CommandLine;
12 import ixa.kaflib.KAFDocument;
13 import org.eclipse.rdf4j.model.Model;
14 import org.eclipse.rdf4j.model.Namespace;
15 import org.eclipse.rdf4j.model.Statement;
16 import org.eclipse.rdf4j.model.impl.LinkedHashModel;
17 import org.eclipse.rdf4j.model.impl.SimpleNamespace;
18 import org.eclipse.rdf4j.model.vocabulary.*;
19 import org.eclipse.rdf4j.rio.RDFFormat;
20 import org.eclipse.rdf4j.rio.RDFWriter;
21 import org.eclipse.rdf4j.rio.Rio;
22 import org.slf4j.LoggerFactory;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.OutputStream;
27 import java.io.Reader;
28 import java.util.Collections;
29 import java.util.List;
30 import java.util.Set;
31
32 public class NewRDFGeneratorTest {
33
34
35 final static String DEFAULT_PATH_INPUT = "/Users/marcorospocher/Downloads/pikes-kem-ud/input-naf";
36 final static String DEFAULT_PATH_OUTPUT = "/Users/marcorospocher/Downloads/pikes-kem-ud/output-rdf";
37
38 public static void main(final String... args) {
39
40
41 final CommandLine cmd = CommandLine
42 .parser()
43 .withName("stripNAF")
44 .withHeader("Strip NAF files of unnecessary layers")
45 .withOption("i", "input",
46 String.format("input folder (default %s)", DEFAULT_PATH_INPUT), "FOLDER",
47 CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
48 .withOption("o", "output",
49 String.format("output folder (default %s)", DEFAULT_PATH_OUTPUT), "FOLDER",
50 CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
51 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
52
53
54
55 File inputFolder = new File(DEFAULT_PATH_INPUT);
56 if (cmd.hasOption("input")) {
57 inputFolder = cmd.getOptionValue("input", File.class);
58 }
59
60 File outputFolder = new File(DEFAULT_PATH_OUTPUT);
61 if (cmd.hasOption("output")) {
62 inputFolder = cmd.getOptionValue("output", File.class);
63 }
64
65
66 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
67 if (!file.isFile()) {
68 continue;
69 }
70 if (file.getName().startsWith(".")) {
71 continue;
72 }
73
74 if ((!file.getName().endsWith(".naf.gz")) && (!file.getName().endsWith(".naf"))) {
75 continue;
76 }
77
78
79 String outputFileName=file.getAbsoluteFile().toString().replace(inputFolder.getAbsolutePath(), outputFolder.getAbsolutePath())+".trig";
80 File outputFile = new File(outputFileName);
81
82
83 if (!outputFile.exists()) {
84
85 try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(file.getAbsoluteFile().toString())))) {
86 try {
87
88
89
90 final KAFDocument document = KAFDocument.createFromStream(reader);
91 reader.close();
92
93 final Model model = new LinkedHashModel();
94
95 NAFExtractor extractor= NAFExtractor.builder().build();
96
97 extractor.generate(document,model,null);
98
99 OutputStream outputstream = IO.buffer(IO.write(outputFile.getAbsolutePath()));
100
101 writeGraph(model, outputstream, outputFileName);
102
103
104 } catch (IOException e) {
105 e.printStackTrace();
106 } catch (Exception e) {
107 e.printStackTrace();
108 }
109 } catch (IOException e) {
110 e.printStackTrace();
111 }
112 }
113 }
114 }
115
116
117 private static void writeGraph(final Model graph, final OutputStream stream,
118 final String fileName) throws IOException {
119
120 final RDFFormat rdfFormat = Rio.getWriterFormatForFileName(fileName).get();
121 if (rdfFormat == null) {
122 throw new IOException("Unsupported RDF format for " + fileName);
123 }
124
125 try {
126 final RDFWriter writer = Rio.createWriter(rdfFormat, stream);
127 final List<Statement> stmts = Lists.newArrayList(graph);
128 Collections.sort(stmts, Statements.statementComparator("spoc",
129 Statements.valueComparator(RDF.NAMESPACE)));
130 final Set<Namespace> namespaces = Sets.newLinkedHashSet(graph.getNamespaces());
131 namespaces.add(KS.NS);
132 namespaces.add(NIF.NS);
133 namespaces.add(DCTERMS.NS);
134 namespaces.add(OWLTIME.NS);
135 namespaces.add(XMLSchema.NS);
136 namespaces.add(OWL.NS);
137 namespaces.add(RDF.NS);
138 namespaces.add(RDFS.NS);
139 namespaces.add(KEM.NS);
140 namespaces.add(KEMT.NS);
141 namespaces.add(ITSRDF.NS);
142 namespaces.add(new SimpleNamespace("dbpedia", "http://dbpedia.org/resource/"));
143 namespaces.add(new SimpleNamespace("wn30", "http://wordnet-rdf.princeton.edu/wn30/"));
144 namespaces.add(new SimpleNamespace("sst", "http://pikes.fbk.eu/wn/sst/"));
145 namespaces.add(new SimpleNamespace("bbn", "http://pikes.fbk.eu/bbn/"));
146 namespaces.add(new SimpleNamespace("pm", "http://premon.fbk.eu/resource/"));
147 namespaces.add(new SimpleNamespace("ili", "http://sli.uvigo.gal/rdf_galnet/"));
148 namespaces.add(new SimpleNamespace("ner", "http://pikes.fbk.eu/ner/"));
149 namespaces.add(new SimpleNamespace("olia-penn-pos","http://purl.org/olia/penn.owl#"));
150 namespaces.add(new SimpleNamespace("olia-ud-pos","http://fginter.github.io/docs/u/pos/all.html#"));
151
152
153 RDFSources.wrap(stmts, namespaces).emit(writer, 1);
154 } catch (final Throwable ex) {
155 throw new IOException(ex);
156 }
157 }
158
159 }
160
161
162
163