1 package eu.fbk.dkm.pikes.resources.ecb;
2
3 import com.google.common.io.Files;
4 import eu.fbk.utils.core.CommandLine;
5 import ixa.kaflib.KAFDocument;
6 import org.apache.commons.io.FileUtils;
7 import org.slf4j.Logger;
8 import org.slf4j.LoggerFactory;
9 import org.w3c.dom.Document;
10
11 import javax.xml.parsers.DocumentBuilder;
12 import javax.xml.parsers.DocumentBuilderFactory;
13 import java.io.File;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16
17
18
19
20 public class ECBparser {
21
22
23
24 private static final Logger LOGGER = LoggerFactory.getLogger(ECBparser.class);
25 private static final String DEFAULT_PREFIX = "http://dkm.fbk.eu/pikes/dataset/ecb";
26
27 public static void main(String[] args) {
28 try {
29 final CommandLine cmd = CommandLine
30 .parser()
31 .withName("./taol-extractor")
32 .withHeader("Convert file from ecb annotated txt to NAF")
33 .withOption("i", "input", "Input folder", "FOLDER",
34 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
35 .withOption("o", "output", "Output folder", "FOLDER",
36 CommandLine.Type.DIRECTORY, true, false, true)
37 .withOption("p", "prefix", String.format("Prefix (default %s)", DEFAULT_PREFIX), "PREFIX",
38 CommandLine.Type.STRING, true, false, false)
39 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
40
41 File inputFolder = cmd.getOptionValue("input", File.class);
42 File outputFolder = cmd.getOptionValue("output", File.class);
43 String prefix = cmd.getOptionValue("prefix", String.class, DEFAULT_PREFIX);
44
45 if (!outputFolder.exists()) {
46 outputFolder.mkdirs();
47 }
48
49 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
50 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
51
52
53
54
55 String tags;
56
57 int i = 0;
58 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
59 if (!file.isFile()) {
60 continue;
61 }
62 if (file.getName().startsWith(".")) {
63 continue;
64 }
65
66 String path = file.getParentFile().toString();
67 String folder = path.substring(path.lastIndexOf("/"));
68 String local_name = folder+File.separator+file.getName();
69
70
71
72 String url = prefix+local_name;
73 String id = "" + i;
74 String title = "";
75
76 String content = FileUtils.readFileToString(file, "utf-8");
77
78
79
80
81
82 content=content.replaceAll("\\<[^>]*>","");
83
84 File outputFile = new File(
85 outputFolder.getAbsolutePath() + File.separator +
86 file.getAbsolutePath().substring(
87 inputFolder.getAbsolutePath().length()).replace(".ecb",".naf"));
88 Files.createParentDirs(outputFile);
89
90 KAFDocument document = new KAFDocument("en", "v3");
91
92 KAFDocument.Public documentPublic = document.createPublic();
93 documentPublic.uri = url;
94 documentPublic.publicId = id;
95
96 KAFDocument.FileDesc documentFileDesc = document.createFileDesc();
97 documentFileDesc.filename = local_name;
98 documentFileDesc.title = title;
99
100 StringBuffer finalContent = new StringBuffer();
101
102 document.setRawText(content);
103
104 document.save(outputFile.getAbsolutePath());
105
106
107 }
108
109 } catch (Exception e) {
110 CommandLine.fail(e);
111 }
112
113 }
114
115 }