1   import ixa.kaflib.*;
2   import org.eclipse.rdf4j.query.algebra.Str;
3   import org.slf4j.LoggerFactory;
4   import org.w3c.dom.Document;
5   import org.w3c.dom.Element;
6   import org.w3c.dom.Node;
7   import org.w3c.dom.NodeList;
8   
9   import javax.xml.parsers.DocumentBuilder;
10  import javax.xml.parsers.DocumentBuilderFactory;
11  import java.io.File;
12  import java.io.IOException;
13  import java.util.ArrayList;
14  import java.util.HashMap;
15  import java.util.HashSet;
16  import java.util.List;
17  
18  /**
19   * Created by alessio on 06/11/15.
20   */
21  
22  public class MergeSemafor {
23  
24      private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(MergeSemafor.class);
25  
26      public static void main(String[] args) {
27          String nafFolder = "/Users/alessio/Documents/semafor-sentences/naf";
28          String semFolder = "/Users/alessio/Documents/semafor-sentences/semafor";
29          String outFolder = "/Users/alessio/Documents/semafor-sentences/out";
30  
31          try {
32  
33              DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
34              DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
35  
36              File nafFolderFile = new File(nafFolder);
37              if (!nafFolderFile.exists()) {
38                  throw new IOException();
39              }
40              if (!nafFolderFile.isDirectory()) {
41                  throw new IOException();
42              }
43  
44              File[] listOfFiles = nafFolderFile.listFiles();
45  
46              for (int i = 0; i < listOfFiles.length; i++) {
47                  File file = listOfFiles[i];
48                  if (file.isFile()) {
49  
50                      System.out.println("File " + file.getName());
51                      KAFDocument document = KAFDocument.createFromFile(file);
52  
53                      File semaforFile = new File(semFolder + File.separator + file.getName().replaceAll("naf$", "xml"));
54                      if (!semaforFile.exists()) {
55                          LOGGER.error("Semafor file {} does not exist", semaforFile.getAbsolutePath());
56                          continue;
57                      }
58  
59                      Document doc = dBuilder.parse(semaforFile);
60                      doc.getDocumentElement().normalize();
61  
62                      NodeList nList;
63                      nList = doc.getElementsByTagName("sentences");
64                      int numSent = nList.getLength();
65  
66                      if (numSent != 1) {
67                          LOGGER.error("Wrong number of sentences: {}", numSent);
68                          continue;
69                      }
70  
71                      nList = doc.getElementsByTagName("annotationSet");
72                      for (int temp = 0; temp < nList.getLength(); temp++) {
73                          Node nNode = nList.item(temp);
74                          if (nNode.getNodeType() == Node.ELEMENT_NODE) {
75                              Element eElement = (Element) nNode;
76  
77                              String frameName = eElement.getAttribute("frameName");
78                              HashMap<String, List<Term>> roles = new HashMap<>();
79  
80                              NodeList labelList = eElement.getElementsByTagName("label");
81                              for (int j = 0; j < labelList.getLength(); j++) {
82                                  Node labelNode = labelList.item(j);
83                                  if (labelNode.getNodeType() == Node.ELEMENT_NODE) {
84                                      Element labelElement = (Element) labelNode;
85  
86                                      String name = labelElement.getAttribute("name");
87                                      String span = labelElement.getAttribute("span");
88                                      int start = Integer.parseInt(labelElement.getAttribute("start"));
89                                      int end = Integer.parseInt(labelElement.getAttribute("end"));
90  
91                                      String[] tokens = span.split("\\s+");
92                                      if (tokens.length == 0) {
93                                          LOGGER.error("Invalid tokens");
94                                          continue;
95                                      }
96  
97                                      List<Term> terms = document.getTerms();
98  
99                                      String firstToken = tokens[0];
100                                     String lastToken = tokens[tokens.length - 1];
101                                     Integer foundFirst = findToken(terms, firstToken, start);
102                                     Integer foundLast = findToken(terms, lastToken, end - lastToken.length() + 1);
103 
104                                     if (foundFirst == null || foundLast == null) {
105                                         LOGGER.error("Found is null");
106                                         continue;
107                                     }
108 
109                                     List<Term> okTerms = new ArrayList<>();
110                                     for (Term term : terms) {
111                                         if (term.getWFs().size() != 1) {
112                                             LOGGER.error("Wrong number of WF");
113                                             continue;
114                                         }
115 
116                                         for (WF wf : term.getWFs()) {
117                                             if (wf.getOffset() >= foundFirst && wf.getOffset() <= foundLast) {
118                                                 okTerms.add(term);
119                                             }
120                                         }
121                                     }
122 
123                                     roles.put(name, okTerms);
124                                 }
125                             }
126 
127                             if (!roles.containsKey("Target")) {
128                                 LOGGER.error("No Target");
129                                 continue;
130                             }
131 
132                             Span<Term> target = KAFDocument.newTermSpan(roles.get("Target"));
133 
134                             Predicate predicate = document.newPredicate(target);
135                             predicate.addExternalRef(document.createExternalRef("FrameNet", frameName));
136                             predicate.setId("f_" + predicate.getId());
137 
138                             for (String key : roles.keySet()) {
139                                 if (key.equals("Target")) {
140                                     continue;
141                                 }
142 
143                                 Span<Term> span = KAFDocument.newTermSpan(roles.get(key));
144                                 Predicate.Role role = document.newRole(predicate, key, span);
145                                 role.addExternalRef(document.createExternalRef("FrameNet", frameName + "@" + key));
146                                 predicate.addRole(role);
147                             }
148                         }
149                     }
150 
151                     String outFileName = outFolder + File.separator + file.getName();
152                     document.save(outFileName);
153                 }
154             }
155 
156         } catch (Exception e) {
157             System.err.println(e.getMessage());
158         }
159     }
160 
161     private static Integer findToken(List<Term> terms, String token, int start) {
162         HashMap<Integer, Term> okTerms = new HashMap<>();
163         for (Term term : terms) {
164             for (WF wf : term.getWFs()) {
165                 if (wf.getForm().trim().toLowerCase().equals(token.toLowerCase())) {
166                     okTerms.put(wf.getOffset(), term);
167                 }
168             }
169         }
170 
171         Integer found = null;
172         for (int k = 0; k < 5; k++) {
173             if (okTerms.containsKey(start - k)) {
174                 found = start - k;
175                 break;
176             }
177         }
178 
179         return found;
180     }
181 
182 }