1   package eu.fbk.dkm.pikes.resources.conllAIDA;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.rdfpro.util.IO;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.Entity;
7   import ixa.kaflib.ExternalRef;
8   import ixa.kaflib.KAFDocument;
9   import ixa.kaflib.Term;
10  import org.apache.commons.lang.StringUtils;
11  import org.slf4j.LoggerFactory;
12  
13  import java.io.*;
14  import java.text.SimpleDateFormat;
15  import java.util.*;
16  import java.util.stream.Collectors;
17  
18  /**
19   * Created by marcorospocher on 19/07/16.
20   */
21  public class DistillEntities {
22  
23      private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(DistillEntities.class);
24      private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
25  
26  
27      public enum removeLayer {
28          deps, chunks, entities, properties, categories, coreferences, opinions, relations, srl, constituency, timeExpressions, linkedEntities, constituencyStrings;
29      }
30  
31      public static void main(String[] args) {
32          try {
33              final CommandLine cmd = CommandLine
34                      .parser()
35                      .withName("stripNAF")
36                      .withHeader("Strip NAF files of unnecessary layers")
37                      .withOption("i", "input-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
38                      .withOption("o", "output-folder", "the folder of the input NAF corpus", "DIR", CommandLine.Type.DIRECTORY, true, false, true)
39                      .withOption("m", "mode", "modality (1=entity-centric, 2=term-centric)", "INT", CommandLine.Type.NON_NEGATIVE_INTEGER, true, false, false)
40                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
41  
42              File inputFolder = cmd.getOptionValue("input-folder", File.class);
43              File outputFolder = cmd.getOptionValue("output-folder", File.class);
44              Integer modality = cmd.getOptionValue("mode", Integer.class,1);
45  //
46              for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputFolder)) {
47                  if (!file.isFile()) {
48                      continue;
49                  }
50                  if (file.getName().startsWith(".")) {
51                      continue;
52                  }
53  
54                  if (!file.getName().endsWith(".naf.gz")) {
55                      continue;
56                  }
57  
58                  System.out.print("Processing: "+file.getAbsoluteFile().toString());
59  
60                  String inputfilenamelocal = file.getName();
61                  String outputfilenamelocal = StringUtils.leftPad(inputfilenamelocal.replace(".naf",".tsv").replace("conll-",""), 11, '0');
62                  //System.out.println(outputfilenamelocal);
63  
64                  String outputfilename = file.getAbsoluteFile().toString().replace(inputFolder.getAbsolutePath(),outputFolder.getAbsolutePath()).replace(inputfilenamelocal,outputfilenamelocal);
65  
66                  File outputFile = new File(outputfilename);
67  
68                  if (!outputFile.exists()) {
69  
70                      try (Reader reader = IO.utf8Reader(IO.buffer(IO.read(file.getAbsoluteFile().toString())))) {
71                          try {
72  
73                              //System.out.print(" WORKING");
74  
75                              KAFDocument document = KAFDocument.createFromStream(reader);
76                              reader.close();
77  
78                              HashMap<Integer,String> toPrint = new HashMap();
79                              String ID = document.getPublic().publicId;
80  
81                              if (modality==1) {
82  
83                                  //System.out.println("Processing: "+file.getAbsoluteFile().toString());
84                                  final List<Entity> entities = document.getEntities();
85  
86                                  for (Entity entity : entities
87                                          ) {
88                                      String type = entity.getType();
89                                      List<ExternalRef> externalRefs = entity.getExternalRefs();
90                                      HashMap<String, Float> refs = new HashMap();
91                                      for (ExternalRef exref : externalRefs
92                                              ) {
93                                          if (exref.getResource().equalsIgnoreCase("dbpedia-candidates")) {
94                                              String ref = exref.getReference();
95                                              float confidence = exref.getConfidence();
96                                              refs.put(ref, confidence);
97                                          } else refs.put("null", 0.0f);
98                                      }
99  
100                                     String span = entity.getStr();
101                                     String IDs = "";
102                                     Integer spanID = Integer.parseInt(entity.getTerms().get(0).getId().replace("t", "")) - 1;
103                                     for (Term t : entity.getTerms()
104                                             ) {
105                                         IDs += String.valueOf(Integer.parseInt(t.getId().replace("t", "")) - 1) + " ";
106                                     }
107                                     //System.out.println(span);
108                                     //System.out.print(type);
109                                     String references = refs.entrySet().stream()
110                                             .sorted(Map.Entry.<String, Float>comparingByValue().reversed()).map(e -> e.getKey().replace("http://dbpedia.org/resource/", "") + " [" + e.getValue() + "]").collect(Collectors.joining("  "));
111                                     String line = ID.replace("conll-", "") + "\t" + IDs + "\t" + span + "\t" + type + "\t" + references;
112                                     //System.out.println(line);
113                                     toPrint.put(spanID, line);
114                                 }
115                             } else if (modality==2) {
116 
117                                 final List<Term> terms = document.getTerms();
118                                 Collections.sort(terms, new Comparator<Term>() {
119                                     @Override
120                                     public int compare(Term o1, Term o2) {
121                                         return Integer.compare(o1.getOffset(),o2.getOffset());
122                                     }
123                                 });
124                                 for (Term t : terms
125                                         ) {
126 
127                                     Integer t_ID = Integer.parseInt(t.getId().replace("t",""))-1;
128 
129                                     String line = t.getForm() + "\t" + t_ID;
130 
131                                     //System.out.println(t.getStr());
132                                     List<Entity> entities = document.getEntitiesByTerm(t);
133 
134 
135 
136                                     if (entities.size()>0) {
137                                         Entity entity = entities.get(0);
138                                         String type = entity.getType();
139 
140                                         List<ExternalRef> externalRefs = entity.getExternalRefs();
141                                         HashMap<String, Float> refs = new HashMap();
142                                         for (ExternalRef exref : externalRefs
143                                                 ) {
144                                             if (exref.getResource().equalsIgnoreCase("dbpedia-candidates")) {
145                                                 String ref = exref.getReference();
146                                                 float confidence = exref.getConfidence();
147                                                 refs.put(ref, confidence);
148                                             } else refs.put("null", 0.0f);
149                                         }
150                                         String span = entity.getStr();
151                                         String IDs = "";
152                                         Integer spanID = Integer.parseInt(entity.getTerms().get(0).getId().replace("t", "")) - 1;
153                                         for (Term tt : entity.getTerms()
154                                                 ) {
155                                             IDs += String.valueOf(Integer.parseInt(t.getId().replace("t", "")) - 1) + " ";
156                                         }
157                                         //System.out.println(span);
158                                         //System.out.print(type);
159                                         String references = refs.entrySet().stream()
160                                                 .sorted(Map.Entry.<String, Float>comparingByValue().reversed()).map(e -> e.getKey().replace("http://dbpedia.org/resource/", "") + " [" + e.getValue() + "]").collect(Collectors.joining("  "));
161                                         //String line = ID.replace("conll-", "") + "\t" + IDs + "\t" + span + "\t" + type + "\t" + references;
162                                         line += "\t" + type + "\t" + references;
163                                         //System.out.println(line);
164 
165                                     }
166                                     toPrint.put(t_ID, line);
167                                 }
168 
169                             }
170 
171 
172 
173 
174                             Files.createParentDirs(outputFile);
175                             try (Writer w = IO.utf8Writer(IO.buffer(IO.write(outputFile.getAbsolutePath())))) {
176                                 toPrint.entrySet().stream()
177                                         .sorted(Map.Entry.<Integer, String>comparingByKey()).map(e -> e.getValue()).forEach(e -> {
178                                     try {
179                                         w.write(e+"\n");
180                                     } catch (IOException e1) {
181                                         e1.printStackTrace();
182                                     }
183                                 });
184 
185                                 w.close();
186                                 //System.out.print(" SAVED");
187 
188                             } catch (IOException ee) {
189                                 ee.printStackTrace();
190                             }
191 
192                             System.out.println(" DONE!");
193 
194 
195                         } catch (Exception e) {
196 
197                         }
198 
199                     }
200 
201 
202                 } //else System.out.println(" SKIPPED");
203 
204             }
205         } catch (FileNotFoundException e) {
206             e.printStackTrace();
207         } catch (IOException e) {
208             e.printStackTrace();
209         }
210     }
211 }