1 package eu.fbk.dkm.pikes.resources.darmstadt;
2
3 import com.google.common.base.Charsets;
4 import com.google.common.io.Files;
5 import eu.fbk.rdfpro.util.Statements;
6 import eu.fbk.utils.core.CommandLine;
7 import ixa.kaflib.KAFDocument;
8 import org.apache.commons.io.FileUtils;
9 import org.apache.commons.io.FilenameUtils;
10 import org.slf4j.LoggerFactory;
11 import org.w3c.dom.Document;
12 import org.w3c.dom.Element;
13 import org.w3c.dom.NodeList;
14
15 import javax.xml.parsers.DocumentBuilder;
16 import javax.xml.parsers.DocumentBuilderFactory;
17 import java.io.ByteArrayInputStream;
18 import java.io.File;
19 import java.util.HashSet;
20 import java.util.Iterator;
21
22
23
24
25
26 public class CorpusLoader {
27
28 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CorpusLoader.class);
29 public static final String DEFAULT_NAMESPACE = "https://www.ukp.tu-eu.fbk.dkm.pikes.resources.darmstadt.de/eu.fbk.dkm.pikes.resources.darmstadt-service-review-corpus/";
30 public static final String[] MMAX_PATTERN = new String[]{"basedata", "markables"};
31 public static final String[] MMAX_SUFFIXES = new String[]{"_words", "_OpinionExpression_level"};
32
33 private static void getFilesRecursive(File pFile, HashSet<String> folders) {
34 for (File file : pFile.listFiles()) {
35 if (file.isDirectory()) {
36 folders.add(file.getAbsolutePath());
37 getFilesRecursive(file, folders);
38 }
39 }
40 }
41
42 public static void main(String[] args) {
43 try {
44 final CommandLine cmd = CommandLine
45 .parser()
46 .withName("eu.fbk.dkm.pikes.resources.darmstadt-loader")
47 .withHeader("Load eu.fbk.dkm.pikes.resources.darmstadt-service-review-corpus")
48 .withOption("i", "input-folder", "the folder of the corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
49 .withOption("n", "namespace", String.format("the namespace for generating document URIs, default %s", DEFAULT_NAMESPACE), "NS", CommandLine.Type.STRING, true, false, false)
50 .withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
51
52 final File inputFile = cmd.getOptionValue("i", File.class);
53
54 String namespace = DEFAULT_NAMESPACE;
55 if (cmd.hasOption("n")) {
56 namespace = cmd.getOptionValue("n", String.class);
57 }
58
59 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
60 dbFactory.setValidating(false);
61 dbFactory.setNamespaceAware(true);
62 dbFactory.setFeature("http://xml.org/sax/features/namespaces", false);
63 dbFactory.setFeature("http://xml.org/sax/features/validation", false);
64 dbFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
65 dbFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
66
67 HashSet<String> folders = new HashSet<>();
68 getFilesRecursive(inputFile, folders);
69
70 HashSet<String> okFolders = new HashSet<>();
71 okLoop:
72 for (String folder : folders) {
73 for (String pattern : MMAX_PATTERN) {
74 StringBuffer newFolder = new StringBuffer();
75 newFolder.append(folder);
76 newFolder.append(File.separator);
77 newFolder.append(pattern);
78
79 if (!folders.contains(newFolder.toString())) {
80 continue okLoop;
81 }
82 }
83
84 okFolders.add(folder);
85 }
86
87 for (String folder : okFolders) {
88 LOGGER.info("Entering folder {}", folder);
89
90 String baseDataDir = folder + File.separator + MMAX_PATTERN[0];
91 File nafDir = new File(folder + File.separator + "naf");
92
93 if (nafDir.exists()) {
94 LOGGER.warn("{} dir exists", nafDir.getAbsolutePath());
95 }
96 else {
97 nafDir.mkdir();
98 }
99
100 Iterator<File> fileIterator;
101 fileIterator = FileUtils.iterateFiles(new File(baseDataDir), new String[]{"xml"}, false);
102 while (fileIterator.hasNext()) {
103 File file = fileIterator.next();
104 StringBuffer stringBuffer = new StringBuffer();
105 String fileBaseName = FilenameUtils.removeExtension(file.getName());
106 fileBaseName = fileBaseName.replaceAll(MMAX_SUFFIXES[0], "");
107
108 String fileContent = Files.toString(file, Charsets.UTF_8);
109
110
111 fileContent = fileContent.replaceAll("&", "&");
112
113 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
114 Document doc = dBuilder.parse(new ByteArrayInputStream(fileContent.getBytes(Charsets.UTF_8)));
115 NodeList nList = doc.getElementsByTagName("word");
116 for (int temp = 0; temp < nList.getLength(); temp++) {
117 Element nNode = (Element) nList.item(temp);
118 stringBuffer.append(nNode.getTextContent().replaceAll("\\s+", ""));
119 stringBuffer.append(" ");
120 }
121
122 String nafFileName = fileBaseName + ".naf";
123 File nafFile = new File(nafDir.getAbsolutePath() + File.separator + nafFileName);
124 String text = stringBuffer.toString().trim();
125 String documentURI = namespace + nafFileName;
126
127 final KAFDocument document = new KAFDocument("en", "v3");
128 document.setRawText(text);
129 document.createPublic();
130 document.getPublic().publicId = Statements.VALUE_FACTORY.createIRI(documentURI).getLocalName();
131 document.getPublic().uri = documentURI;
132 document.createFileDesc();
133 document.getFileDesc().filename = nafFileName;
134 document.getFileDesc().title = "-";
135 document.save(nafFile.getAbsolutePath());
136 }
137 }
138
139 } catch (final Throwable ex) {
140 CommandLine.fail(ex);
141 }
142 }
143 }