1   package eu.fbk.dkm.pikes.resources.conllAIDA;
2   
3   import eu.fbk.utils.core.CommandLine;
4   import ixa.kaflib.KAFDocument;
5   import org.apache.commons.lang.StringUtils;
6   import org.slf4j.LoggerFactory;
7   
8   import java.io.File;
9   import java.io.IOException;
10  import java.nio.file.Files;
11  import java.nio.file.Paths;
12  import java.text.SimpleDateFormat;
13  import java.util.ArrayList;
14  import java.util.Date;
15  import java.util.List;
16  import java.util.stream.Collectors;
17  import java.util.stream.Stream;
18  
19  /**
20   * Created by marcorospocher on 12/05/16.
21   */
22  public class ConvertDocsFromAIDAGS {
23  
24      private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
25      private static String DEFAULT_URL = "http://pikes.fbk.eu/conll/";
26  
27      public static void main(String[] args) throws Exception {
28  
29  
30          final CommandLine cmd = CommandLine
31                  .parser()
32                  .withName("ConvertDocsFromAIDAGS")
33                  .withHeader("Generates < YAGO entity, rdf:type , NER type> triples")
34                  .withOption("a", "aida", "AIDA-YAGO2-dataset.tsv", "FILE", CommandLine.Type.FILE, true, false, true)
35                  .withOption("o", "output", "Output file", "FOLDER", CommandLine.Type.DIRECTORY, true,
36                          false, true)
37                  .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
38                          CommandLine.Type.STRING, true, false, false)
39                  .withLogger(LoggerFactory.getLogger("eu.fbk")) //
40                  .parse(args);
41  
42          File aidagold = cmd.getOptionValue("aida", File.class);
43          File outputfile = cmd.getOptionValue("output", File.class);
44  
45          List<String> conll_list = new ArrayList<>();
46  
47          try (Stream<String> stream = Files.lines(Paths.get(aidagold.toString()))) {
48  
49              conll_list.addAll(stream
50                      //                   .filter(line -> !line.startsWith("-DOCSTART-"))
51                      //                   .filter(line -> !line.isEmpty())
52                      .collect(Collectors.toList()));
53  
54          } catch (IOException e) {
55              e.printStackTrace();
56          }
57  
58  //        Integer ID=1;
59  
60          String IDstr = getID(conll_list.get(0));
61          conll_list.remove(0);
62          conll_list.add("-DOCSTART-"); //to ease processing
63  
64          String text="";
65  
66          for (String line : conll_list
67                  ) {
68  
69              if (line.startsWith("-DOCSTART-")) {
70  
71  
72  
73                  if (!text.isEmpty()) {
74  
75                      Integer ID=Integer.parseInt(IDstr.split(" ")[0].replace("testa","").replace("testb",""));
76  
77                      File outputFile = new File(outputfile.getAbsoluteFile().toString() + "/" + StringUtils.leftPad(ID.toString(),4,"0") + ".naf");
78  
79                      //File outputFile = new File(outputFileName);
80                      outputFile.getParentFile().mkdirs();
81                      KAFDocument document = new KAFDocument("en", "v3");
82  
83                      document.save(outputFile.getAbsolutePath());
84  
85                      document.setRawText(text);
86  
87                      KAFDocument.FileDesc fileDesc = document.createFileDesc();
88                      fileDesc.title = ID.toString();
89  
90                      Date thisDate = new Date();
91  
92                      fileDesc.creationtime = sdf.format(thisDate);
93                      String URL_str = ID.toString();
94                      fileDesc.filename = URL_str;
95  
96                      String urlTemplate = DEFAULT_URL;
97                      if (cmd.hasOption("url-template")) {
98                          urlTemplate = cmd.getOptionValue("url-template", String.class);
99                      }
100 
101                     KAFDocument.Public aPublic = document.createPublic();
102                     //aPublic.uri = URL_str;
103                     aPublic.uri = urlTemplate + ID.toString();
104                     aPublic.publicId = IDstr;
105 
106                     //set public ID so it works for AIDA evaluator
107 
108 
109                     document.save(outputFile.getAbsolutePath());
110                     text="";
111 //                    ID++;
112 //                    oldID=newID;
113                 }
114 
115                 if (!line.equals("-DOCSTART-")) IDstr=getID(line);
116 
117             } else if (line.isEmpty()) text+="\n";
118             else {
119                 String[] conll_item = line.split("\t");
120                 text+=conll_item[0]+" ";
121             }
122 
123         }
124 
125 
126     }
127 
128     private static String getID (String line){
129 
130 //        System.out.println("writing file "+line);
131         return line.substring(line.indexOf("(")+1,line.indexOf(")"));
132 
133     }
134 
135 }