1 package eu.fbk.dkm.pikes.naflib;
2
3 import com.google.common.io.Files;
4 import eu.fbk.utils.core.CommandLine;
5 import eu.fbk.rdfpro.util.IO;
6 import ixa.kaflib.KAFDocument;
7 import org.slf4j.LoggerFactory;
8
9 import java.io.*;
10 import java.text.SimpleDateFormat;
11
12
13
14
15 public class StripNAF {
16
17 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(StripNAF.class);
18 private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
19
20
21 public enum removeLayer {
22 deps, chunks, entities, properties, categories, coreferences, opinions, relations, srl, constituency, timeExpressions, linkedEntities, constituencyStrings;
23 }
24
25 public static void main(String[] args) {
26 try {
27 final CommandLine cmd = CommandLine
28 .parser()
29 .withName("stripNAF")
30 .withHeader("Strip NAF files of unnecessary layers")
31 .withOption("i", "input-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
32 .withOption("o", "output-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
33 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
34
35 File inputFolder = cmd.getOptionValue("input-folder", File.class);
36 File outputFolder = cmd.getOptionValue("output-folder", File.class);
37
38 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
39 if (!file.isFile()) {
40 continue;
41 }
42 if (file.getName().startsWith(".")) {
43 continue;
44 }
45
46 if (!file.getName().endsWith(".naf.gz")) {
47 continue;
48 }
49
50
51 File outputFile = new File(file.getAbsoluteFile().toString().replace(inputFolder.getAbsolutePath(),outputFolder.getAbsolutePath()));
52
53 if (!outputFile.exists()) {
54
55 try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(file.getAbsoluteFile().toString())))) {
56 try {
57
58
59
60 KAFDocument document = KAFDocument.createFromStream(reader);
61 reader.close();
62
63
64
65 for (removeLayer layer : removeLayer.values()) {
66 document.removeLayer(KAFDocument.Layer.valueOf(layer.toString()));
67 }
68
69 Files.createParentDirs(outputFile);
70 try (Writer w = IO.utf8Writer(IO.buffer(IO.write(outputFile.getAbsolutePath())))) {
71 w.write(document.toString());
72 w.close();
73
74
75 }
76
77 System.out.println("");
78
79 } catch (Exception e) {
80
81 }
82
83 }
84 }
85
86 }
87 } catch (FileNotFoundException e) {
88 e.printStackTrace();
89 } catch (IOException e) {
90 e.printStackTrace();
91 }
92 }
93 }