1 package eu.fbk.dkm.pikes.rdf; 2 3 import java.io.File; 4 import java.io.InputStream; 5 6 import javax.xml.parsers.DocumentBuilder; 7 import javax.xml.parsers.DocumentBuilderFactory; 8 import javax.xml.transform.Transformer; 9 import javax.xml.transform.TransformerFactory; 10 import javax.xml.transform.dom.DOMSource; 11 import javax.xml.transform.stream.StreamResult; 12 13 import org.w3c.dom.Document; 14 import org.w3c.dom.Element; 15 import org.w3c.dom.NodeList; 16 17 import eu.fbk.rdfpro.util.IO; 18 19 public class SemaforExtractor { 20 21 public static void main(final String... args) throws Throwable { 22 23 final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 24 final DocumentBuilder builder = factory.newDocumentBuilder(); 25 26 final TransformerFactory transformerFactory = TransformerFactory.newInstance(); 27 final Transformer transformer = transformerFactory.newTransformer(); 28 29 for (final String arg : args) { 30 final File dir = new File(arg); 31 for (final File file : dir.listFiles()) { 32 if (file.getName().endsWith(".out")) { 33 System.out.println("Procesing " + arg); 34 try (InputStream stream = IO.read(file.getAbsolutePath())) { 35 final Document document = builder.parse(stream); 36 process(document); 37 final DOMSource source = new DOMSource(document); 38 final StreamResult result = new StreamResult(new File(file 39 .getAbsolutePath().replace(".out", ".xml"))); 40 transformer.transform(source, result); 41 } 42 } 43 } 44 } 45 } 46 47 private static void process(final Document document) { 48 final String text = document.getElementsByTagName("text").item(0).getTextContent(); 49 System.out.println(text); 50 final NodeList list = document.getElementsByTagName("label"); 51 for (int i = 0; i < list.getLength(); ++i) { 52 final Element element = (Element) list.item(i); 53 final int start = Integer.parseInt(element.getAttribute("start")); 54 final int end = Integer.parseInt(element.getAttribute("end")); 55 final String span = text.substring(start, end + 1); 56 element.setAttribute("span", span); 57 } 58 } 59 60 }