1 package ixa.kaflib;
2
3 import java.io.Serializable;
4 import java.util.*;
5
6
7 public class Tree implements Serializable {
8
9 public static final String HEAD_MARK = "=H";
10
11
12 private TreeNode root;
13 private Integer sentence = null;
14
15 Tree(TreeNode root) {
16 this.root = root;
17 }
18
19 Tree(TreeNode root, Integer sentence) {
20 this.root = root;
21 this.sentence = sentence;
22 }
23
24 public TreeNode getRoot() {
25 return this.root;
26 }
27
28 public void setRoot(TreeNode root) {
29 this.root = root;
30 }
31
32
33
34
35
36
37 static void parenthesesToKaf(String parOut, KAFDocument kaf) throws Exception {
38 parenthesesToKaf(parOut, kaf, null);
39 }
40
41 public Integer getSentence() {
42 return sentence;
43 }
44
45 static void parenthesesToKaf(String parOut, KAFDocument kaf, Integer sentence) throws Exception {
46 String[] tokens = Tree.tokenize(parOut);
47 Tree.check(tokens);
48 HashMap<Integer, Integer> parMatching = Tree.matchParentheses(tokens);
49 HashMap<Integer, Term> termMatching = Tree.matchTerms(tokens, kaf.getSentenceTerms(sentence));
50
51 if (termMatching.size() == 0) {
52 return;
53 }
54 List<Tree> trees = new ArrayList<Tree>();
55 int current = 0;
56 while (current < tokens.length) {
57 int end = parMatching.get(current);
58 NonTerminal root = Tree.createNonTerminal(tokens, current+1, end-1, parMatching, termMatching, kaf);
59 kaf.newConstituent(root, sentence);
60 current = end + 1;
61 }
62 }
63
64 private static String[] tokenize(String parOut) {
65 List<String> tokens = new ArrayList<String>();
66 int current = 0;
67 int length = parOut.length();
68 String token = new String("");
69 while (current < length) {
70 char nextChar = parOut.charAt(current++);
71 if (nextChar == '(') {
72 if (!token.isEmpty()) {
73 tokens.add(token);
74 }
75 tokens.add(new String("("));
76 token = new String("");
77 }
78 else if (nextChar == ')') {
79 if (!token.isEmpty()) {
80 tokens.add(token);
81 }
82 tokens.add(new String(")"));
83 token = new String("");
84 }
85 else if ((nextChar == ' ') || (nextChar == '\n')) {
86 if (!token.isEmpty()) {
87 tokens.add(token);
88 token = new String();
89 }
90 }
91 else {
92 token += nextChar;
93 }
94 }
95 return tokens.toArray(new String[tokens.size()]);
96 }
97
98 private static HashMap<Integer, Integer> matchParentheses(String[] tokens) {
99 HashMap<Integer, Integer> indexes = new HashMap<Integer, Integer>();
100 Stack<Integer> stack = new Stack<Integer>();
101 int ind = 0;
102 for (String token : tokens) {
103 if (token.equals("(")) {
104 stack.push(ind);
105 }
106 else if (token.equals(")")) {
107 indexes.put(stack.pop(), ind);
108 }
109 ind++;
110 }
111 return indexes;
112 }
113
114 private static HashMap<Integer, Term> matchTerms(String[] tokens, List<Term> terms) throws Exception {
115 HashMap<Integer, Term> mapping = new HashMap<Integer, Term>();
116 int nextTerm = 0;
117 for (int i=1; i<tokens.length; i++) {
118 if ((!tokens[i].equals("(")) && (!tokens[i].equals(")"))) {
119 if ((!tokens[i-1].equals("(")) && (!tokens[i-1].equals(")"))) {
120 String termForm = terms.get(nextTerm).getForm();
121 String previousTermForm = "";
122 if (nextTerm != 0) {
123 previousTermForm = terms.get(nextTerm-1).getForm();
124 }
125
126 if (termForm.equals("(")) {
127 termForm = new String("-LRB-");
128 }
129 else if (termForm.equals(")")) {
130 termForm = new String("-RRB-");
131 }
132 else if (termForm.equals("{")) {
133 termForm = new String("-LCB-");
134 }
135 else if (termForm.equals("}")) {
136 termForm = new String("-RCB-");
137 }
138 else if (termForm.equals("[")) {
139 termForm = new String("-LSB-");
140 }
141 else if (termForm.equals("]")) {
142 termForm = new String("-RSB-");
143 }
144
145
146 if (termForm.equals(tokens[i]) || termForm.contains(tokens[i])) {
147 mapping.put(i, terms.get(nextTerm));
148 nextTerm++;
149 }
150 else if ((nextTerm > 0) && previousTermForm.contains(tokens[i])) {
151
152 mapping.put(i, terms.get(nextTerm-1));
153
154 }
155 else {
156 boolean matched = false;
157 nextTerm++;
158 while (!matched && (nextTerm != terms.size())) {
159 if (terms.get(nextTerm).getForm().equals(tokens[i])) {
160 mapping.put(i, terms.get(nextTerm));
161 matched = true;
162 }
163 nextTerm++;
164 }
165 if (!matched) {
166
167 throw new Exception("Can't perform parentheses=>NAF at constituency: form \"" + tokens[i] + "\" not found in the KAF document.");
168 }
169 }
170 }
171 }
172 }
173 return mapping;
174 }
175
176 private static NonTerminal createNonTerminal(String[] tokens, int start, int end, HashMap<Integer, Integer> parenthesesMap, HashMap<Integer, Term> termMap, KAFDocument kaf) {
177 String tag = tokens[start];
178 boolean isHead = isHead(tag);
179 if (isHead) {
180 tag = removeHeadMark(tag);
181 }
182 NonTerminal nt = kaf.newNonTerminal(tag);
183 if (isHead) {
184 nt.setHead(true);
185 }
186 if (end - start == 1) {
187 Terminal t = Tree.createTerminal(tokens[end], termMap.get(end), kaf);
188 try {
189 nt.addChild(t);
190 } catch(Exception e) {}
191 }
192 else {
193 int current = start + 1;
194 while (current <= end) {
195 int subParEnd = parenthesesMap.get(current);
196 NonTerminal nnt = Tree.createNonTerminal(tokens, current+1, subParEnd-1, parenthesesMap, termMap, kaf);
197 try {
198 nt.addChild(nnt);
199 } catch(Exception e) {}
200 current = subParEnd + 1;
201 }
202 }
203 return nt;
204 }
205
206 private static Terminal createTerminal(String token, Term term, KAFDocument kaf) {
207 Span<Term> span = kaf.newTermSpan();
208 span.addTarget(term);
209 return kaf.newTerminal(span);
210 }
211
212 private static void check(String[] tokens) throws Exception {
213 int opened = 0;
214 for (int i=0; i<tokens.length; i++) {
215 if (tokens[i].equals("(")) {
216 if ((i>0) && (tokens[i-1].equals("("))) {
217 throw Tree.getException(tokens, i);
218 }
219 else if (i == tokens.length-1) {
220 throw Tree.getException(tokens, i);
221 }
222 opened++;
223 }
224 else if (tokens[i].equals(")")) {
225 if ((i<3) || tokens[i-1].equals("(")) {
226 throw Tree.getException(tokens, i);
227 }
228 opened--;
229 }
230 else {
231 if ((i==0) || (i == tokens.length-1)) {
232 throw Tree.getException(tokens, i);
233 }
234 else if (isAWord(tokens[i-1]) && isAWord(tokens[i+1])) {
235 throw Tree.getException(tokens, i);
236 }
237 else if (tokens[i-1].equals(")")) {
238 throw Tree.getException(tokens, i);
239 }
240 else if (tokens[i-1].equals("(") && tokens[i+1].equals(")")) {
241 throw Tree.getException(tokens, i);
242 }
243 }
244 }
245 if (opened != 0) {
246 throw Tree.getException(tokens, tokens.length-1);
247 }
248 }
249
250 private static boolean isAWord(String token) {
251 return (!token.equals("(")) && (!token.equals(")"));
252 }
253
254 private static Exception getException(String[] tokens, int ind) {
255 String str = new String("Parentheses format not valid: \"... ");
256 for (int i=(ind<5 ? 0 : ind-5); i<(ind>tokens.length-6 ? tokens.length-1 : ind+5); i++) {
257 if (i == ind) {
258 str += "->";
259 }
260 str += tokens[i];
261 if (i == ind) {
262 str += "<-";
263 }
264 str += " ";
265 }
266 return new Exception(str + " ...\"");
267 }
268
269 private static boolean isHead(String tag) {
270 return tag.endsWith(HEAD_MARK);
271 }
272
273 private static String removeHeadMark(String tag) {
274 if (!isHead(tag)) {
275 return tag;
276 }
277 return tag.substring(0, tag.length() - HEAD_MARK.length());
278 }
279 }