1   package eu.fbk.dkm.pikes.resources.ecb;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.utils.core.CommandLine;
5   import ixa.kaflib.KAFDocument;
6   import org.apache.commons.io.FileUtils;
7   import org.slf4j.Logger;
8   import org.slf4j.LoggerFactory;
9   import org.w3c.dom.Document;
10  
11  import javax.xml.parsers.DocumentBuilder;
12  import javax.xml.parsers.DocumentBuilderFactory;
13  import java.io.File;
14  import java.util.regex.Matcher;
15  import java.util.regex.Pattern;
16  
17  /**
18   * Created by marcorospocher on 12/03/16.
19   */
20  public class ECBparser {
21  
22  
23  
24      private static final Logger LOGGER = LoggerFactory.getLogger(ECBparser.class);
25      private static final String DEFAULT_PREFIX = "http://dkm.fbk.eu/pikes/dataset/ecb";
26  
27      public static void main(String[] args) {
28          try {
29              final CommandLine cmd = CommandLine
30                      .parser()
31                      .withName("./taol-extractor")
32                      .withHeader("Convert file from ecb annotated txt to NAF")
33                      .withOption("i", "input", "Input folder", "FOLDER",
34                              CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
35                      .withOption("o", "output", "Output folder", "FOLDER",
36                              CommandLine.Type.DIRECTORY, true, false, true)
37                      .withOption("p", "prefix", String.format("Prefix (default %s)", DEFAULT_PREFIX), "PREFIX",
38                              CommandLine.Type.STRING, true, false, false)
39                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
40  
41              File inputFolder = cmd.getOptionValue("input", File.class);
42              File outputFolder = cmd.getOptionValue("output", File.class);
43              String prefix = cmd.getOptionValue("prefix", String.class, DEFAULT_PREFIX);
44  
45              if (!outputFolder.exists()) {
46                  outputFolder.mkdirs();
47              }
48  
49              DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
50              DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
51  
52              // uncomment to get the manual mention spans
53              //Pattern MY_PATTERN = Pattern.compile("\\\">[^<]*</MENTION>");
54  
55              String tags;
56  
57              int i = 0;
58              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
59                  if (!file.isFile()) {
60                      continue;
61                  }
62                  if (file.getName().startsWith(".")) {
63                      continue;
64                  }
65  
66                  String path = file.getParentFile().toString();
67                  String folder = path.substring(path.lastIndexOf("/"));
68                  String local_name = folder+File.separator+file.getName();
69  
70                  //System.out.println(prefix+folder+File.separator+file.getName());
71  
72                  String url = prefix+local_name;
73                  String id = "" + i;
74                  String title = "";
75  
76                  String content = FileUtils.readFileToString(file, "utf-8");
77  
78                  // uncomment to get the manual mention spans
79                  //Matcher m = MY_PATTERN.matcher(content);
80                  //while (m.find()) System.out.println(m.group(0).replace("\">","").replace("</MENTION>",""));
81  
82                  content=content.replaceAll("\\<[^>]*>","");
83  
84                  File outputFile = new File(
85                          outputFolder.getAbsolutePath() + File.separator +
86                                  file.getAbsolutePath().substring(
87                                          inputFolder.getAbsolutePath().length()).replace(".ecb",".naf"));
88                  Files.createParentDirs(outputFile);
89  
90                  KAFDocument document = new KAFDocument("en", "v3");
91  
92                  KAFDocument.Public documentPublic = document.createPublic();
93                  documentPublic.uri = url;
94                  documentPublic.publicId = id;
95  
96                  KAFDocument.FileDesc documentFileDesc = document.createFileDesc();
97                  documentFileDesc.filename = local_name;
98                  documentFileDesc.title = title;
99  
100                 StringBuffer finalContent = new StringBuffer();
101 
102                 document.setRawText(content);
103 
104                 document.save(outputFile.getAbsolutePath());
105 
106 
107             }
108 
109         } catch (Exception e) {
110             CommandLine.fail(e);
111         }
112 
113     }
114 
115 }