1   package eu.fbk.dkm.pikes.resources.conllAIDA;
2   
3   import eu.fbk.utils.core.CommandLine;
4   import ixa.kaflib.KAFDocument;
5   import org.apache.commons.lang.StringUtils;
6   import org.json.simple.JSONObject;
7   import org.json.simple.parser.JSONParser;
8   import org.slf4j.LoggerFactory;
9   
10  import java.io.File;
11  import java.io.IOException;
12  import java.nio.file.Files;
13  import java.nio.file.Paths;
14  import java.text.SimpleDateFormat;
15  import java.util.ArrayList;
16  import java.util.Date;
17  import java.util.List;
18  import java.util.stream.Collectors;
19  import java.util.stream.Stream;
20  
21  /**
22   * Created by marcorospocher on 12/05/16.
23   * DEPRECATED!! Better use ConvertDocsFromAIDAGS
24   */
25  public class ConvertDocsFromGS {
26  
27      private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
28      private static String DEFAULT_URL = "http://pikes.fbk.eu/conll/";
29  
30      public static void main(String[] args) throws Exception {
31  
32  
33          final CommandLine cmd = CommandLine
34                  .parser()
35                  .withName("ConvertDocsFromGS")
36                  .withHeader("Generates < YAGO entity, rdf:type , NER type> triples")
37                  .withOption("c", "conll", "CONLL folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
38                  .withOption("o", "output", "Output file", "FOLDER", CommandLine.Type.DIRECTORY, true,
39                          false, true)
40                  .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
41                          CommandLine.Type.STRING, true, false, false)
42                  .withLogger(LoggerFactory.getLogger("eu.fbk")) //
43                  .parse(args);
44  
45          File conllfolder = cmd.getOptionValue("conll", File.class);
46          File outputfile = cmd.getOptionValue("output", File.class);
47  
48          List<String> conll_list = new ArrayList<>();
49  
50          try (Stream<String> stream = Files.lines(Paths.get(conllfolder.toString()+"/eng.train"))) {
51  
52              conll_list.addAll(stream
53   //                   .filter(line -> !line.startsWith("-DOCSTART-"))
54   //                   .filter(line -> !line.isEmpty())
55                      .collect(Collectors.toList()));
56  
57          } catch (IOException e) {
58              e.printStackTrace();
59          }
60  
61          //added as missing starting DOCSTART
62          conll_list.add("-DOCSTART- -X- O O");
63  
64          try (Stream<String> stream = Files.lines(Paths.get(conllfolder.toString()+"/eng.testa"))) {
65  
66  
67              conll_list.addAll(stream
68   //                   .filter(line -> !line.startsWith("-DOCSTART-"))
69   //                   .filter(line -> !line.isEmpty())
70                      .collect(Collectors.toList()));
71  
72          } catch (IOException e) {
73              e.printStackTrace();
74          }
75  
76          //added as missing starting DOCSTART
77          conll_list.add("-DOCSTART- -X- O O");
78  
79          try (Stream<String> stream = Files.lines(Paths.get(conllfolder.toString()+"/eng.testb"))) {
80              conll_list.addAll(stream
81   //                   .filter(line -> !line.startsWith("-DOCSTART-"))
82   //                   .filter(line -> !line.isEmpty())
83                      .collect(Collectors.toList()));
84  
85          } catch (IOException e) {
86              e.printStackTrace();
87          }
88  
89          Integer ID=1;
90          conll_list.remove(0);
91          conll_list.add("-DOCSTART-"); //to ease processing
92  
93          String text="";
94  
95          for (String line : conll_list
96                  ) {
97  
98              if (line.startsWith("-DOCSTART-")) {
99  
100                 if (!text.isEmpty()) {
101                     File outputFile = new File(outputfile.getAbsoluteFile().toString() + "/" + StringUtils.leftPad(ID.toString(),4,"0") + ".naf");
102 
103                     //File outputFile = new File(outputFileName);
104                     outputFile.getParentFile().mkdirs();
105                     KAFDocument document = new KAFDocument("en", "v3");
106 
107                     document.save(outputFile.getAbsolutePath());
108 
109                     document.setRawText(text);
110 
111                     KAFDocument.FileDesc fileDesc = document.createFileDesc();
112                     fileDesc.title = ID.toString();
113 
114                     Date thisDate = new Date();
115 
116                     fileDesc.creationtime = sdf.format(thisDate);
117                     String URL_str = ID.toString();
118                     fileDesc.filename = URL_str;
119 
120                     String urlTemplate = DEFAULT_URL;
121                     if (cmd.hasOption("url-template")) {
122                         urlTemplate = cmd.getOptionValue("url-template", String.class);
123                     }
124 
125                     KAFDocument.Public aPublic = document.createPublic();
126                     //aPublic.uri = URL_str;
127                     aPublic.uri = urlTemplate + ID.toString();
128                     aPublic.publicId = ID.toString();
129 
130                     document.save(outputFile.getAbsolutePath());
131                     text="";
132                     ID++;
133                 }
134 
135             } else if (line.isEmpty()) text+="\n";
136             else {
137                 String[] conll_item = line.split(" ");
138                 text+=conll_item[0]+" ";
139             }
140 
141         }
142     }
143 
144 }