1   package eu.fbk.dkm.pikes.naflib;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.utils.core.CommandLine;
5   import eu.fbk.rdfpro.util.IO;
6   import ixa.kaflib.KAFDocument;
7   import org.slf4j.LoggerFactory;
8   
9   import java.io.*;
10  import java.text.SimpleDateFormat;
11  
12  /**
13   * Created by marcorospocher on 19/07/16.
14   */
15  public class StripNAF {
16  
17      private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(StripNAF.class);
18      private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
19  
20  
21      public enum removeLayer {
22          deps, chunks, entities, properties, categories, coreferences, opinions, relations, srl, constituency, timeExpressions, linkedEntities, constituencyStrings;
23      }
24  
25      public static void main(String[] args) {
26          try {
27              final CommandLine cmd = CommandLine
28                      .parser()
29                      .withName("stripNAF")
30                      .withHeader("Strip NAF files of unnecessary layers")
31                      .withOption("i", "input-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
32                      .withOption("o", "output-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
33                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
34  
35              File inputFolder = cmd.getOptionValue("input-folder", File.class);
36              File outputFolder = cmd.getOptionValue("output-folder", File.class);
37  
38              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
39                  if (!file.isFile()) {
40                      continue;
41                  }
42                  if (file.getName().startsWith(".")) {
43                      continue;
44                  }
45  
46                  if (!file.getName().endsWith(".naf.gz")) {
47                      continue;
48                  }
49  
50                  //System.out.print("Processing: "+file.getAbsoluteFile().toString());
51                  File outputFile = new File(file.getAbsoluteFile().toString().replace(inputFolder.getAbsolutePath(),outputFolder.getAbsolutePath()));
52  
53                  if (!outputFile.exists()) {
54  
55                      try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(file.getAbsoluteFile().toString())))) {
56                          try {
57  
58                              //System.out.print(" WORKING");
59  
60                              KAFDocument document = KAFDocument.createFromStream(reader);
61                              reader.close();
62  
63                              //System.out.println("Processing: "+file.getAbsoluteFile().toString());
64  
65                              for (removeLayer layer : removeLayer.values()) {
66                                  document.removeLayer(KAFDocument.Layer.valueOf(layer.toString()));
67                              }
68  
69                              Files.createParentDirs(outputFile);
70                              try (Writer w = IO.utf8Writer(IO.buffer(IO.write(outputFile.getAbsolutePath())))) {
71                                  w.write(document.toString());
72                                  w.close();
73                                  //System.out.print(" SAVED");
74  
75                              }
76  
77                              System.out.println("");
78  
79                          } catch (Exception e) {
80  
81                          }
82  
83                      }
84                  } //else System.out.println(" SKIPPED");
85  
86              }
87          } catch (FileNotFoundException e) {
88              e.printStackTrace();
89          } catch (IOException e) {
90              e.printStackTrace();
91          }
92      }
93  }