1 package eu.fbk.dkm.pikes.resources.tempeval;
2
3 import com.google.common.io.Files;
4 import eu.fbk.utils.core.CommandLine;
5 import ixa.kaflib.KAFDocument;
6 import org.joox.JOOX;
7 import org.joox.Match;
8 import org.slf4j.Logger;
9 import org.slf4j.LoggerFactory;
10 import org.w3c.dom.Document;
11 import org.w3c.dom.Element;
12
13 import javax.xml.parsers.DocumentBuilder;
14 import javax.xml.parsers.DocumentBuilderFactory;
15 import java.io.File;
16
17
18
19
20
21 public class TMLtoNAF {
22
23 private static final Logger LOGGER = LoggerFactory.getLogger(TMLtoNAF.class);
24 private static final String DEFAULT_PREFIX = "http://tempeval3/";
25
26 public static void main(String[] args) {
27 try {
28 final CommandLine cmd = CommandLine
29 .parser()
30 .withName("./taol-extractor")
31 .withHeader("Convert file from Treccani XML to NAF")
32 .withOption("i", "input", "Input folder", "FOLDER",
33 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
34 .withOption("o", "output", "Output folder", "FOLDER",
35 CommandLine.Type.DIRECTORY, true, false, true)
36 .withOption("p", "prefix", String.format("Prefix (default %s)", DEFAULT_PREFIX), "PREFIX",
37 CommandLine.Type.STRING, true, false, false)
38 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
39
40 File inputFolder = cmd.getOptionValue("input", File.class);
41 File outputFolder = cmd.getOptionValue("output", File.class);
42 String prefix = cmd.getOptionValue("prefix", String.class, DEFAULT_PREFIX);
43
44 if (!outputFolder.exists()) {
45 outputFolder.mkdirs();
46 }
47
48 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
49 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
50
51 int i = 0;
52 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
53 if (!file.isFile()) {
54 continue;
55 }
56 if (file.getName().startsWith(".")) {
57 continue;
58 }
59
60 Document doc = dBuilder.parse(file);
61 doc.getDocumentElement().normalize();
62
63 String docID = null;
64 Match docidElements = JOOX.$(doc).find("DOCID");
65 for (Element docidElement : docidElements) {
66 docID = docidElement.getTextContent().trim();
67 }
68
69 if (docID == null) {
70 LOGGER.error("DOCID is null");
71 continue;
72 }
73
74 String url = prefix + docID;
75
76 String thisTimex = null;
77 Match docTimeElements = JOOX.$(doc).find("DCT").find("TIMEX3");
78
79 for (Element docTimeElement : docTimeElements) {
80 String function = docTimeElement.getAttribute("functionInDocument");
81 if (function == null) {
82 continue;
83 }
84 if (!function.equals("CREATION_TIME")) {
85 continue;
86 }
87
88 thisTimex = docTimeElement.getAttribute("value");
89 }
90
91 if (thisTimex == null) {
92 LOGGER.error("TIMEX3 is null");
93 continue;
94 }
95
96 String text = null;
97 Match textElements = JOOX.$(doc).find("TEXT");
98
99 for (Element textElement : textElements) {
100 text = textElement.getTextContent();
101 }
102
103 if (text == null) {
104 LOGGER.error("TEXT is null");
105 continue;
106 }
107
108 String fileName = outputFolder.getAbsolutePath() + File.separator + file.getAbsolutePath()
109 .substring(inputFolder.getAbsolutePath().length());
110 if (!fileName.endsWith("naf")) {
111 fileName += ".naf";
112 }
113 File outputFile = new File(fileName);
114 Files.createParentDirs(outputFile);
115
116 KAFDocument document = new KAFDocument("en", "v3");
117
118 KAFDocument.Public documentPublic = document.createPublic();
119 documentPublic.uri = url;
120 documentPublic.publicId = docID;
121
122 KAFDocument.FileDesc documentFileDesc = document.createFileDesc();
123 documentFileDesc.filename = file.getName();
124 documentFileDesc.title = docID;
125
126 document.setRawText(text);
127
128 document.save(outputFile);
129 }
130
131 } catch (Exception e) {
132 CommandLine.fail(e);
133 }
134 }
135 }