1 package eu.fbk.dkm.pikes.resources.ecb;
2
3 import eu.fbk.utils.core.CommandLine;
4 import eu.fbk.utils.core.IO;
5 import ixa.kaflib.KAFDocument;
6 import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory;
8 import org.w3c.dom.Document;
9 import org.w3c.dom.Element;
10 import org.w3c.dom.Node;
11 import org.w3c.dom.NodeList;
12
13 import javax.xml.parsers.DocumentBuilder;
14 import javax.xml.parsers.DocumentBuilderFactory;
15 import javax.xml.xpath.XPath;
16 import javax.xml.xpath.XPathConstants;
17 import javax.xml.xpath.XPathExpression;
18 import javax.xml.xpath.XPathFactory;
19 import java.io.File;
20 import java.util.regex.Matcher;
21 import java.util.regex.Pattern;
22
23
24
25
26
27 public class ConvertECBPlus {
28
29 private static final Logger LOGGER = LoggerFactory.getLogger(ConvertECBPlus.class);
30 private static Pattern folderPattern = Pattern.compile("^([0-9]+)");
31
32 public static void main(String[] args) {
33 final CommandLine cmd = CommandLine
34 .parser()
35 .withName("convert-ecb-plus")
36 .withHeader("Convert ECB+ files to NAF")
37 .withOption("i", "input-path", "the base path of the corpus", "DIR",
38 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
39 .withOption("o", "output-path", "output NAF folder", "DIR",
40 CommandLine.Type.DIRECTORY, true, false, true)
41 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
42
43 final File inputPath = cmd.getOptionValue("i", File.class);
44 final File outputPath = cmd.getOptionValue("o", File.class);
45
46 boolean opMkDirs = outputPath.mkdirs();
47 if (!opMkDirs) {
48 LOGGER.error("Unable to create folder {}", outputPath.getAbsolutePath());
49 }
50
51 File[] files = inputPath.listFiles();
52 for (File file : files) {
53 if (!file.isDirectory()) {
54 continue;
55 }
56
57 File[] thisFolderFiles = file.listFiles();
58 for (File nafFile : thisFolderFiles) {
59 if (!nafFile.isFile()) {
60 continue;
61 }
62 if (!nafFile.getAbsolutePath().endsWith(".xml")) {
63 continue;
64 }
65
66 String relativeFilePath = nafFile.getAbsolutePath().substring(inputPath.getAbsolutePath().length());
67 if (relativeFilePath.startsWith(File.separator)) {
68 relativeFilePath = relativeFilePath.substring(1);
69 }
70
71 try {
72 KAFDocument document = new KAFDocument("en", "FBK");
73
74 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
75 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
76 XPathFactory xPathfactory = XPathFactory.newInstance();
77 XPath xpath = xPathfactory.newXPath();
78
79 XPathExpression expr;
80 NodeList nl;
81
82 Document doc = dBuilder.parse(IO.read(nafFile.getAbsolutePath()));
83 doc.getDocumentElement().normalize();
84
85
86 expr = xpath.compile("/Document/token");
87 nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
88
89 StringBuffer buffer = new StringBuffer();
90 StringBuffer text = new StringBuffer();
91 int lastSent = 0;
92 for (int i = 0; i < nl.getLength(); i++) {
93 Node item = nl.item(i);
94 Element element = (Element) item;
95
96 int sentence = Integer.parseInt(element.getAttribute("sentence"));
97 if (relativeFilePath.contains("ecbplus") && sentence == 0) {
98 continue;
99 }
100 if (sentence != lastSent) {
101 if (buffer.length() > 0) {
102 text.append(buffer.toString().trim()).append("\n");
103 }
104 buffer = new StringBuffer();
105 lastSent = sentence;
106 }
107
108 buffer.append(element.getTextContent()).append(" ");
109 }
110 if (buffer.length() > 0) {
111 text.append(buffer.toString().trim()).append("\n");
112 }
113
114 document.setRawText(text.toString().trim());
115 KAFDocument.Public aPublic = document.createPublic();
116 aPublic.uri = "http://ecbplus/" + relativeFilePath;
117 aPublic.publicId = relativeFilePath;
118 KAFDocument.FileDesc fileDesc = document.createFileDesc();
119 fileDesc.title = "";
120
121 Matcher matcher = folderPattern.matcher(relativeFilePath);
122 if (matcher.find()) {
123 String folderID = matcher.group(1);
124 File newFolder = new File(outputPath + File.separator + folderID);
125 newFolder.mkdirs();
126 }
127
128 File outputFile = new File(outputPath + File.separator + relativeFilePath + ".naf");
129 document.save(outputFile);
130 } catch (Exception e) {
131 e.printStackTrace();
132 }
133
134 }
135
136 }
137 }
138 }