1 package eu.fbk.dkm.pikes.resources.treccani;
2
3 import com.google.common.io.Files;
4 import eu.fbk.utils.core.CommandLine;
5 import ixa.kaflib.KAFDocument;
6 import org.apache.commons.lang.StringEscapeUtils;
7 import org.joox.JOOX;
8 import org.slf4j.Logger;
9 import org.slf4j.LoggerFactory;
10 import org.w3c.dom.Document;
11 import org.w3c.dom.Element;
12
13 import javax.xml.parsers.DocumentBuilder;
14 import javax.xml.parsers.DocumentBuilderFactory;
15 import java.io.File;
16
17
18
19
20
21 public class TAOL {
22
23 private static final Logger LOGGER = LoggerFactory.getLogger(TAOL.class);
24 private static final String DEFAULT_PREFIX = "opencms://system/modules/com.atosorigin.treccani.bancadati.xml";
25
26 public static void main(String[] args) {
27 try {
28 final CommandLine cmd = CommandLine
29 .parser()
30 .withName("./taol-extractor")
31 .withHeader("Convert file from Treccani XML to NAF")
32 .withOption("i", "input", "Input folder", "FOLDER",
33 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
34 .withOption("o", "output", "Output folder", "FOLDER",
35 CommandLine.Type.DIRECTORY, true, false, true)
36 .withOption("p", "prefix", String.format("Prefix (default %s)", DEFAULT_PREFIX), "PREFIX",
37 CommandLine.Type.STRING, true, false, false)
38 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
39
40 File inputFolder = cmd.getOptionValue("input", File.class);
41 File outputFolder = cmd.getOptionValue("output", File.class);
42 String prefix = cmd.getOptionValue("prefix", String.class, DEFAULT_PREFIX);
43
44 if (!outputFolder.exists()) {
45 outputFolder.mkdirs();
46 }
47
48 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
49 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
50
51 int i = 0;
52 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
53 if (!file.isFile()) {
54 continue;
55 }
56 if (file.getName().startsWith(".")) {
57 continue;
58 }
59
60 Document doc = dBuilder.parse(file);
61 doc.getDocumentElement().normalize();
62 i++;
63
64 for (Element element : JOOX.$(doc).get()) {
65 if (!element.getTagName().equals("ARTICOLI")) {
66 continue;
67 }
68
69 for (Element articolo : JOOX.$(element).find("ARTICOLO")) {
70 String language = articolo.getAttribute("language");
71 String thisPrefix = prefix + "/taol/" + language + "/";
72
73 String url = thisPrefix + i;
74 String id = "" + i;
75 String title = "";
76
77 Element cidaElement = JOOX.$(articolo).find("CIDA").get(0);
78 if (cidaElement != null) {
79 String cida = cidaElement.getTextContent().trim().replaceAll("\\s+", "");
80 url = thisPrefix + cida;
81 id = cida;
82 }
83
84 Element ctitElement = JOOX.$(articolo).find("CTIT").get(0);
85 if (ctitElement != null) {
86 title = ctitElement.getTextContent().trim().replaceAll("\\s+", "");
87 }
88
89 Element contentElement = JOOX.$(articolo).find("content").get(0);
90 if (contentElement != null) {
91 File outputFile = new File(
92 outputFolder.getAbsolutePath() + File.separator + language + File.separator +
93 file.getAbsolutePath().substring(
94 inputFolder.getAbsolutePath().length()));
95 Files.createParentDirs(outputFile);
96
97 KAFDocument document = new KAFDocument(language, "v3");
98
99 KAFDocument.Public documentPublic = document.createPublic();
100 documentPublic.uri = url;
101 documentPublic.publicId = id;
102
103 KAFDocument.FileDesc documentFileDesc = document.createFileDesc();
104 documentFileDesc.filename = file.getName();
105 documentFileDesc.title = title;
106
107 String content = contentElement.getTextContent();
108 content = content.replaceAll("<br />", "\n");
109 content = content.replaceAll(" +", " ");
110 content = content.replaceAll("<[^>]+>", "");
111
112 StringBuffer finalContent = new StringBuffer();
113
114 String[] lines = content.split(System.getProperty("line.separator"));
115 for (String line : lines) {
116 line = line.trim();
117 if (!line.matches(".*[.?!]+$") && line.length() != 0) {
118 line = line + ".";
119 }
120 if (line.startsWith("H1.") || line.startsWith("H2.")) {
121 line = line.substring(3).trim();
122 }
123
124 finalContent.append(line).append("\n");
125 }
126
127 String text = StringEscapeUtils.unescapeHtml(finalContent.toString());
128
129 document.setRawText(text);
130
131 document.save(outputFile.getAbsolutePath());
132 }
133 }
134 }
135 }
136
137
138
139
140
141
142
143
144
145
146 } catch (Exception e) {
147 CommandLine.fail(e);
148 }
149
150 }
151 }