1   package eu.fbk.dkm.pikes.resources.ecb;
2   
import eu.fbk.utils.core.CommandLine;
import eu.fbk.utils.core.IO;
import ixa.kaflib.KAFDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
22  
23  /**
24   * Created by alessio on 21/09/16.
25   */
26  
27  public class ConvertECBPlus {
28  
29      private static final Logger LOGGER = LoggerFactory.getLogger(ConvertECBPlus.class);
30      private static Pattern folderPattern = Pattern.compile("^([0-9]+)");
31  
32      public static void main(String[] args) {
33          final CommandLine cmd = CommandLine
34                  .parser()
35                  .withName("convert-ecb-plus")
36                  .withHeader("Convert ECB+ files to NAF")
37                  .withOption("i", "input-path", "the base path of the corpus", "DIR",
38                          CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
39                  .withOption("o", "output-path", "output NAF folder", "DIR",
40                          CommandLine.Type.DIRECTORY, true, false, true)
41                  .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
42  
43          final File inputPath = cmd.getOptionValue("i", File.class);
44          final File outputPath = cmd.getOptionValue("o", File.class);
45  
46          boolean opMkDirs = outputPath.mkdirs();
47          if (!opMkDirs) {
48              LOGGER.error("Unable to create folder {}", outputPath.getAbsolutePath());
49          }
50  
51          File[] files = inputPath.listFiles();
52          for (File file : files) {
53              if (!file.isDirectory()) {
54                  continue;
55              }
56  
57              File[] thisFolderFiles = file.listFiles();
58              for (File nafFile : thisFolderFiles) {
59                  if (!nafFile.isFile()) {
60                      continue;
61                  }
62                  if (!nafFile.getAbsolutePath().endsWith(".xml")) {
63                      continue;
64                  }
65  
66                  String relativeFilePath = nafFile.getAbsolutePath().substring(inputPath.getAbsolutePath().length());
67                  if (relativeFilePath.startsWith(File.separator)) {
68                      relativeFilePath = relativeFilePath.substring(1);
69                  }
70  
71                  try {
72                      KAFDocument document = new KAFDocument("en", "FBK");
73  
74                      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
75                      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
76                      XPathFactory xPathfactory = XPathFactory.newInstance();
77                      XPath xpath = xPathfactory.newXPath();
78  
79                      XPathExpression expr;
80                      NodeList nl;
81  
82                      Document doc = dBuilder.parse(IO.read(nafFile.getAbsolutePath()));
83                      doc.getDocumentElement().normalize();
84  
85                      // Normalization rules
86                      expr = xpath.compile("/Document/token");
87                      nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
88  
89                      StringBuffer buffer = new StringBuffer();
90                      StringBuffer text = new StringBuffer();
91                      int lastSent = 0;
92                      for (int i = 0; i < nl.getLength(); i++) {
93                          Node item = nl.item(i);
94                          Element element = (Element) item;
95  
96                          int sentence = Integer.parseInt(element.getAttribute("sentence"));
97                          if (relativeFilePath.contains("ecbplus") && sentence == 0) {
98                              continue;
99                          }
100                         if (sentence != lastSent) {
101                             if (buffer.length() > 0) {
102                                 text.append(buffer.toString().trim()).append("\n");
103                             }
104                             buffer = new StringBuffer();
105                             lastSent = sentence;
106                         }
107 
108                         buffer.append(element.getTextContent()).append(" ");
109                     }
110                     if (buffer.length() > 0) {
111                         text.append(buffer.toString().trim()).append("\n");
112                     }
113 
114                     document.setRawText(text.toString().trim());
115                     KAFDocument.Public aPublic = document.createPublic();
116                     aPublic.uri = "http://ecbplus/" + relativeFilePath;
117                     aPublic.publicId = relativeFilePath;
118                     KAFDocument.FileDesc fileDesc = document.createFileDesc();
119                     fileDesc.title = "";
120 
121                     Matcher matcher = folderPattern.matcher(relativeFilePath);
122                     if (matcher.find()) {
123                         String folderID = matcher.group(1);
124                         File newFolder = new File(outputPath + File.separator + folderID);
125                         newFolder.mkdirs();
126                     }
127 
128                     File outputFile = new File(outputPath + File.separator + relativeFilePath + ".naf");
129                     document.save(outputFile);
130                 } catch (Exception e) {
131                     e.printStackTrace();
132                 }
133 
134             }
135 
136         }
137     }
138 }