1   package eu.fbk.dkm.pikes.resources.treccani;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.utils.core.CommandLine;
5   import ixa.kaflib.KAFDocument;
6   import org.apache.commons.lang.StringEscapeUtils;
7   import org.joox.JOOX;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  import org.w3c.dom.Document;
11  import org.w3c.dom.Element;
12  
13  import javax.xml.parsers.DocumentBuilder;
14  import javax.xml.parsers.DocumentBuilderFactory;
15  import java.io.File;
16  
17  /**
18   * Created by alessio on 17/12/15.
19   */
20  
21  public class TAOL {
22  
23      private static final Logger LOGGER = LoggerFactory.getLogger(TAOL.class);
24      private static final String DEFAULT_PREFIX = "opencms://system/modules/com.atosorigin.treccani.bancadati.xml";
25  
26      public static void main(String[] args) {
27          try {
28              final CommandLine cmd = CommandLine
29                      .parser()
30                      .withName("./taol-extractor")
31                      .withHeader("Convert file from Treccani XML to NAF")
32                      .withOption("i", "input", "Input folder", "FOLDER",
33                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
34                      .withOption("o", "output", "Output folder", "FOLDER",
35                              CommandLine.Type.DIRECTORY, true, false, true)
36                      .withOption("p", "prefix", String.format("Prefix (default %s)", DEFAULT_PREFIX), "PREFIX",
37                              CommandLine.Type.STRING, true, false, false)
38                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
39  
40              File inputFolder = cmd.getOptionValue("input", File.class);
41              File outputFolder = cmd.getOptionValue("output", File.class);
42              String prefix = cmd.getOptionValue("prefix", String.class, DEFAULT_PREFIX);
43  
44              if (!outputFolder.exists()) {
45                  outputFolder.mkdirs();
46              }
47  
48              DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
49              DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
50  
51              int i = 0;
52              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
53                  if (!file.isFile()) {
54                      continue;
55                  }
56                  if (file.getName().startsWith(".")) {
57                      continue;
58                  }
59  
60                  Document doc = dBuilder.parse(file);
61                  doc.getDocumentElement().normalize();
62                  i++;
63  
64                  for (Element element : JOOX.$(doc).get()) {
65                      if (!element.getTagName().equals("ARTICOLI")) {
66                          continue;
67                      }
68  
69                      for (Element articolo : JOOX.$(element).find("ARTICOLO")) {
70                          String language = articolo.getAttribute("language");
71                          String thisPrefix = prefix + "/taol/" + language + "/";
72  
73                          String url = thisPrefix + i;
74                          String id = "" + i;
75                          String title = "";
76  
77                          Element cidaElement = JOOX.$(articolo).find("CIDA").get(0);
78                          if (cidaElement != null) {
79                              String cida = cidaElement.getTextContent().trim().replaceAll("\\s+", "");
80                              url = thisPrefix + cida;
81                              id = cida;
82                          }
83  
84                          Element ctitElement = JOOX.$(articolo).find("CTIT").get(0);
85                          if (ctitElement != null) {
86                              title = ctitElement.getTextContent().trim().replaceAll("\\s+", "");
87                          }
88  
89                          Element contentElement = JOOX.$(articolo).find("content").get(0);
90                          if (contentElement != null) {
91                              File outputFile = new File(
92                                      outputFolder.getAbsolutePath() + File.separator + language + File.separator +
93                                              file.getAbsolutePath().substring(
94                                                      inputFolder.getAbsolutePath().length()));
95                              Files.createParentDirs(outputFile);
96  
97                              KAFDocument document = new KAFDocument(language, "v3");
98  
99                              KAFDocument.Public documentPublic = document.createPublic();
100                             documentPublic.uri = url;
101                             documentPublic.publicId = id;
102 
103                             KAFDocument.FileDesc documentFileDesc = document.createFileDesc();
104                             documentFileDesc.filename = file.getName();
105                             documentFileDesc.title = title;
106 
107                             String content = contentElement.getTextContent();
108                             content = content.replaceAll("<br />", "\n");
109                             content = content.replaceAll(" +", " ");
110                             content = content.replaceAll("<[^>]+>", "");
111 
112                             StringBuffer finalContent = new StringBuffer();
113 
114                             String[] lines = content.split(System.getProperty("line.separator"));
115                             for (String line : lines) {
116                                 line = line.trim();
117                                 if (!line.matches(".*[.?!]+$") && line.length() != 0) {
118                                     line = line + ".";
119                                 }
120                                 if (line.startsWith("H1.") || line.startsWith("H2.")) {
121                                     line = line.substring(3).trim();
122                                 }
123 
124                                 finalContent.append(line).append("\n");
125                             }
126 
127                             String text = StringEscapeUtils.unescapeHtml(finalContent.toString());
128 
129                             document.setRawText(text);
130 
131                             document.save(outputFile.getAbsolutePath());
132                         }
133                     }
134                 }
135             }
136 
137 //            String serverUrl = cmd.getOptionValue("server", String.class);
138 //            File inputFile = cmd.getOptionValue("input", File.class);
139 //
140 //            URL url = new URL(serverUrl);
141 //            TintopServer server = new TintopServer(url);
142 //            TintopClient client = new TintopClient(server);
143 //
144 //            String whole = FileUtils.readFileToString(inputFile);
145 //            System.out.println(client.call(whole));
146         } catch (Exception e) {
147             CommandLine.fail(e);
148         }
149 
150     }
151 }