1 package eu.fbk.dkm.pikes.rdf;
2
3 import java.io.File;
4 import java.io.InputStream;
5
6 import javax.xml.parsers.DocumentBuilder;
7 import javax.xml.parsers.DocumentBuilderFactory;
8 import javax.xml.transform.Transformer;
9 import javax.xml.transform.TransformerFactory;
10 import javax.xml.transform.dom.DOMSource;
11 import javax.xml.transform.stream.StreamResult;
12
13 import org.w3c.dom.Document;
14 import org.w3c.dom.Element;
15 import org.w3c.dom.NodeList;
16
17 import eu.fbk.rdfpro.util.IO;
18
19 public class SemaforExtractor {
20
21 public static void main(final String... args) throws Throwable {
22
23 final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
24 final DocumentBuilder builder = factory.newDocumentBuilder();
25
26 final TransformerFactory transformerFactory = TransformerFactory.newInstance();
27 final Transformer transformer = transformerFactory.newTransformer();
28
29 for (final String arg : args) {
30 final File dir = new File(arg);
31 for (final File file : dir.listFiles()) {
32 if (file.getName().endsWith(".out")) {
33 System.out.println("Procesing " + arg);
34 try (InputStream stream = IO.read(file.getAbsolutePath())) {
35 final Document document = builder.parse(stream);
36 process(document);
37 final DOMSource source = new DOMSource(document);
38 final StreamResult result = new StreamResult(new File(file
39 .getAbsolutePath().replace(".out", ".xml")));
40 transformer.transform(source, result);
41 }
42 }
43 }
44 }
45 }
46
47 private static void process(final Document document) {
48 final String text = document.getElementsByTagName("text").item(0).getTextContent();
49 System.out.println(text);
50 final NodeList list = document.getElementsByTagName("label");
51 for (int i = 0; i < list.getLength(); ++i) {
52 final Element element = (Element) list.item(i);
53 final int start = Integer.parseInt(element.getAttribute("start"));
54 final int end = Integer.parseInt(element.getAttribute("end"));
55 final String span = text.substring(start, end + 1);
56 element.setAttribute("span", span);
57 }
58 }
59
60 }