1 import ixa.kaflib.*;
2 import org.eclipse.rdf4j.query.algebra.Str;
3 import org.slf4j.LoggerFactory;
4 import org.w3c.dom.Document;
5 import org.w3c.dom.Element;
6 import org.w3c.dom.Node;
7 import org.w3c.dom.NodeList;
8
9 import javax.xml.parsers.DocumentBuilder;
10 import javax.xml.parsers.DocumentBuilderFactory;
11 import java.io.File;
12 import java.io.IOException;
13 import java.util.ArrayList;
14 import java.util.HashMap;
15 import java.util.HashSet;
16 import java.util.List;
17
18
19
20
21
22 public class MergeSemafor {
23
24 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(MergeSemafor.class);
25
26 public static void main(String[] args) {
27 String nafFolder = "/Users/alessio/Documents/semafor-sentences/naf";
28 String semFolder = "/Users/alessio/Documents/semafor-sentences/semafor";
29 String outFolder = "/Users/alessio/Documents/semafor-sentences/out";
30
31 try {
32
33 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
34 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
35
36 File nafFolderFile = new File(nafFolder);
37 if (!nafFolderFile.exists()) {
38 throw new IOException();
39 }
40 if (!nafFolderFile.isDirectory()) {
41 throw new IOException();
42 }
43
44 File[] listOfFiles = nafFolderFile.listFiles();
45
46 for (int i = 0; i < listOfFiles.length; i++) {
47 File file = listOfFiles[i];
48 if (file.isFile()) {
49
50 System.out.println("File " + file.getName());
51 KAFDocument document = KAFDocument.createFromFile(file);
52
53 File semaforFile = new File(semFolder + File.separator + file.getName().replaceAll("naf$", "xml"));
54 if (!semaforFile.exists()) {
55 LOGGER.error("Semafor file {} does not exist", semaforFile.getAbsolutePath());
56 continue;
57 }
58
59 Document doc = dBuilder.parse(semaforFile);
60 doc.getDocumentElement().normalize();
61
62 NodeList nList;
63 nList = doc.getElementsByTagName("sentences");
64 int numSent = nList.getLength();
65
66 if (numSent != 1) {
67 LOGGER.error("Wrong number of sentences: {}", numSent);
68 continue;
69 }
70
71 nList = doc.getElementsByTagName("annotationSet");
72 for (int temp = 0; temp < nList.getLength(); temp++) {
73 Node nNode = nList.item(temp);
74 if (nNode.getNodeType() == Node.ELEMENT_NODE) {
75 Element eElement = (Element) nNode;
76
77 String frameName = eElement.getAttribute("frameName");
78 HashMap<String, List<Term>> roles = new HashMap<>();
79
80 NodeList labelList = eElement.getElementsByTagName("label");
81 for (int j = 0; j < labelList.getLength(); j++) {
82 Node labelNode = labelList.item(j);
83 if (labelNode.getNodeType() == Node.ELEMENT_NODE) {
84 Element labelElement = (Element) labelNode;
85
86 String name = labelElement.getAttribute("name");
87 String span = labelElement.getAttribute("span");
88 int start = Integer.parseInt(labelElement.getAttribute("start"));
89 int end = Integer.parseInt(labelElement.getAttribute("end"));
90
91 String[] tokens = span.split("\\s+");
92 if (tokens.length == 0) {
93 LOGGER.error("Invalid tokens");
94 continue;
95 }
96
97 List<Term> terms = document.getTerms();
98
99 String firstToken = tokens[0];
100 String lastToken = tokens[tokens.length - 1];
101 Integer foundFirst = findToken(terms, firstToken, start);
102 Integer foundLast = findToken(terms, lastToken, end - lastToken.length() + 1);
103
104 if (foundFirst == null || foundLast == null) {
105 LOGGER.error("Found is null");
106 continue;
107 }
108
109 List<Term> okTerms = new ArrayList<>();
110 for (Term term : terms) {
111 if (term.getWFs().size() != 1) {
112 LOGGER.error("Wrong number of WF");
113 continue;
114 }
115
116 for (WF wf : term.getWFs()) {
117 if (wf.getOffset() >= foundFirst && wf.getOffset() <= foundLast) {
118 okTerms.add(term);
119 }
120 }
121 }
122
123 roles.put(name, okTerms);
124 }
125 }
126
127 if (!roles.containsKey("Target")) {
128 LOGGER.error("No Target");
129 continue;
130 }
131
132 Span<Term> target = KAFDocument.newTermSpan(roles.get("Target"));
133
134 Predicate predicate = document.newPredicate(target);
135 predicate.addExternalRef(document.createExternalRef("FrameNet", frameName));
136 predicate.setId("f_" + predicate.getId());
137
138 for (String key : roles.keySet()) {
139 if (key.equals("Target")) {
140 continue;
141 }
142
143 Span<Term> span = KAFDocument.newTermSpan(roles.get(key));
144 Predicate.Role role = document.newRole(predicate, key, span);
145 role.addExternalRef(document.createExternalRef("FrameNet", frameName + "@" + key));
146 predicate.addRole(role);
147 }
148 }
149 }
150
151 String outFileName = outFolder + File.separator + file.getName();
152 document.save(outFileName);
153 }
154 }
155
156 } catch (Exception e) {
157 System.err.println(e.getMessage());
158 }
159 }
160
161 private static Integer findToken(List<Term> terms, String token, int start) {
162 HashMap<Integer, Term> okTerms = new HashMap<>();
163 for (Term term : terms) {
164 for (WF wf : term.getWFs()) {
165 if (wf.getForm().trim().toLowerCase().equals(token.toLowerCase())) {
166 okTerms.put(wf.getOffset(), term);
167 }
168 }
169 }
170
171 Integer found = null;
172 for (int k = 0; k < 5; k++) {
173 if (okTerms.containsKey(start - k)) {
174 found = start - k;
175 break;
176 }
177 }
178
179 return found;
180 }
181
182 }