1   package eu.fbk.dkm.pikes.resources.tempeval;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.utils.core.CommandLine;
5   import ixa.kaflib.KAFDocument;
6   import org.joox.JOOX;
7   import org.joox.Match;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  import org.w3c.dom.Document;
11  import org.w3c.dom.Element;
12  
13  import javax.xml.parsers.DocumentBuilder;
14  import javax.xml.parsers.DocumentBuilderFactory;
15  import java.io.File;
16  
17  /**
18   * Created by alessio on 05/02/16.
19   */
20  
21  public class TMLtoNAF {
22  
23      private static final Logger LOGGER = LoggerFactory.getLogger(TMLtoNAF.class);
24      private static final String DEFAULT_PREFIX = "http://tempeval3/";
25  
26      public static void main(String[] args) {
27          try {
28              final CommandLine cmd = CommandLine
29                      .parser()
30                      .withName("./taol-extractor")
31                      .withHeader("Convert file from Treccani XML to NAF")
32                      .withOption("i", "input", "Input folder", "FOLDER",
33                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
34                      .withOption("o", "output", "Output folder", "FOLDER",
35                              CommandLine.Type.DIRECTORY, true, false, true)
36                      .withOption("p", "prefix", String.format("Prefix (default %s)", DEFAULT_PREFIX), "PREFIX",
37                              CommandLine.Type.STRING, true, false, false)
38                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
39  
40              File inputFolder = cmd.getOptionValue("input", File.class);
41              File outputFolder = cmd.getOptionValue("output", File.class);
42              String prefix = cmd.getOptionValue("prefix", String.class, DEFAULT_PREFIX);
43  
44              if (!outputFolder.exists()) {
45                  outputFolder.mkdirs();
46              }
47  
48              DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
49              DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
50  
51              int i = 0;
52              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
53                  if (!file.isFile()) {
54                      continue;
55                  }
56                  if (file.getName().startsWith(".")) {
57                      continue;
58                  }
59  
60                  Document doc = dBuilder.parse(file);
61                  doc.getDocumentElement().normalize();
62  
63                  String docID = null;
64                  Match docidElements = JOOX.$(doc).find("DOCID");
65                  for (Element docidElement : docidElements) {
66                      docID = docidElement.getTextContent().trim();
67                  }
68  
69                  if (docID == null) {
70                      LOGGER.error("DOCID is null");
71                      continue;
72                  }
73  
74                  String url = prefix + docID;
75  
76                  String thisTimex = null;
77                  Match docTimeElements = JOOX.$(doc).find("DCT").find("TIMEX3");
78  
79                  for (Element docTimeElement : docTimeElements) {
80                      String function = docTimeElement.getAttribute("functionInDocument");
81                      if (function == null) {
82                          continue;
83                      }
84                      if (!function.equals("CREATION_TIME")) {
85                          continue;
86                      }
87  
88                      thisTimex = docTimeElement.getAttribute("value");
89                  }
90  
91                  if (thisTimex == null) {
92                      LOGGER.error("TIMEX3 is null");
93                      continue;
94                  }
95  
96                  String text = null;
97                  Match textElements = JOOX.$(doc).find("TEXT");
98  
99                  for (Element textElement : textElements) {
100                     text = textElement.getTextContent();
101                 }
102 
103                 if (text == null) {
104                     LOGGER.error("TEXT is null");
105                     continue;
106                 }
107 
108                 String fileName = outputFolder.getAbsolutePath() + File.separator + file.getAbsolutePath()
109                         .substring(inputFolder.getAbsolutePath().length());
110                 if (!fileName.endsWith("naf")) {
111                     fileName += ".naf";
112                 }
113                 File outputFile = new File(fileName);
114                 Files.createParentDirs(outputFile);
115 
116                 KAFDocument document = new KAFDocument("en", "v3");
117 
118                 KAFDocument.Public documentPublic = document.createPublic();
119                 documentPublic.uri = url;
120                 documentPublic.publicId = docID;
121 
122                 KAFDocument.FileDesc documentFileDesc = document.createFileDesc();
123                 documentFileDesc.filename = file.getName();
124                 documentFileDesc.title = docID;
125 
126                 document.setRawText(text);
127 
128                 document.save(outputFile);
129             }
130 
131         } catch (Exception e) {
132             CommandLine.fail(e);
133         }
134     }
135 }