1   package ixa.kaflib;
2   
3   import java.io.Serializable;
4   import java.util.*;
5   
6   /**  */
7   public class Tree implements Serializable { //?
8   
9       public static final String HEAD_MARK = "=H";
10  
11      /** Tree's root node */
12      private TreeNode root;
13  	private Integer sentence = null;
14  
15      Tree(TreeNode root) {
16  	this.root = root;
17      }
18  
19  	Tree(TreeNode root, Integer sentence) {
20  		this.root = root;
21  		this.sentence = sentence;
22  	}
23  
24  	public TreeNode getRoot() {
25  	return this.root;
26      }
27  
28      public void setRoot(TreeNode root) {
29  	this.root = root;
30      }
31  
32  
33      /***********************************************************/
34      /* Code for converting OpenNLP's parentheses output to NAF */
35      /***********************************************************/
36  
37  	static void parenthesesToKaf(String parOut, KAFDocument kaf) throws Exception {
38  		parenthesesToKaf(parOut, kaf, null);
39  	}
40  
41  	public Integer getSentence() {
42  		return sentence;
43  	}
44  
45  	static void parenthesesToKaf(String parOut, KAFDocument kaf, Integer sentence) throws Exception {
46  	String[] tokens = Tree.tokenize(parOut);
47  	Tree.check(tokens);
48          HashMap<Integer, Integer> parMatching = Tree.matchParentheses(tokens);
49  	HashMap<Integer, Term> termMatching = Tree.matchTerms(tokens, kaf.getSentenceTerms(sentence));
50  	// behin-behineko irtenbidea errorea ekiditeko: hutsa itzuli
51  	if (termMatching.size() == 0) {
52  	    return;
53  	}
54  	List<Tree> trees = new ArrayList<Tree>();
55  	int current = 0;
56  	while (current < tokens.length) {
57  	    int end = parMatching.get(current);
58  	    NonTerminal root = Tree.createNonTerminal(tokens, current+1, end-1, parMatching, termMatching, kaf);
59  	    kaf.newConstituent(root, sentence);
60  	    current = end + 1;
61  	}
62      }
63  
64      private static String[] tokenize(String parOut) {
65  	List<String> tokens = new ArrayList<String>();
66  	int current = 0;
67  	int length = parOut.length();
68  	String token = new String("");
69  	while (current < length) {
70  	    char nextChar = parOut.charAt(current++);
71  	    if (nextChar == '(') {
72  		if (!token.isEmpty()) {
73  		    tokens.add(token);
74  		}
75  		tokens.add(new String("("));
76  		token = new String("");
77  	    }
78  	    else if (nextChar == ')') {
79  		if (!token.isEmpty()) {
80  		    tokens.add(token);
81  		}
82  		tokens.add(new String(")"));
83  		token = new String("");
84  	    }
85  	    else if ((nextChar == ' ') || (nextChar == '\n')) {
86  		if (!token.isEmpty()) {
87  		    tokens.add(token);
88  		    token = new String();
89  		}
90  	    }
91  	    else {
92  		token += nextChar;
93  	    }
94  	}
95  	return tokens.toArray(new String[tokens.size()]);
96      }
97  
98      private static HashMap<Integer, Integer>  matchParentheses(String[] tokens) {
99  	HashMap<Integer, Integer> indexes = new HashMap<Integer, Integer>();
100 	Stack<Integer> stack = new Stack<Integer>();
101 	int ind = 0;
102 	for (String token : tokens) {
103 	    if (token.equals("(")) {
104 		stack.push(ind);
105 	    }
106 	    else if (token.equals(")")) {
107 		indexes.put(stack.pop(), ind);
108 	    }
109 	    ind++;
110 	}
111 	return indexes;
112     }
113 
114     private static HashMap<Integer, Term> matchTerms(String[] tokens, List<Term> terms) throws Exception {
115 	HashMap<Integer, Term> mapping = new HashMap<Integer, Term>();
116 	int nextTerm = 0;
117 	for (int i=1; i<tokens.length; i++) {
118 	    if ((!tokens[i].equals("(")) && (!tokens[i].equals(")"))) {
119 		if ((!tokens[i-1].equals("(")) && (!tokens[i-1].equals(")"))) {
120 		    String termForm = terms.get(nextTerm).getForm();
121 		    String previousTermForm = "";
122 		    if (nextTerm != 0) {
123 			previousTermForm = terms.get(nextTerm-1).getForm();
124 		    }
125 
126 		    if (termForm.equals("(")) {
127 			termForm = new String("-LRB-");
128 		    }
129 		    else if (termForm.equals(")")) {
130 			termForm = new String("-RRB-");
131 		    }
132 		    else if (termForm.equals("{")) { 
133 			termForm = new String("-LCB-");
134 		    }
135 		    else if (termForm.equals("}")) { 
136 			termForm = new String("-RCB-");
137 		    }
138 		    else if (termForm.equals("[")) {
139 			    termForm = new String("-LSB-");
140 		    }
141 		    else if (termForm.equals("]")) {
142 			    termForm = new String("-RSB-");
143 		    }
144 
145 		    
146 		    if (termForm.equals(tokens[i]) || termForm.contains(tokens[i])) {
147 			mapping.put(i, terms.get(nextTerm));			
148 			nextTerm++;
149 		    }
150 		    else if ((nextTerm > 0) && previousTermForm.contains(tokens[i])) {
151 			// The token is part of a multitoken
152 			mapping.put(i, terms.get(nextTerm-1));
153 			// Don't update nextTerm
154 		    }
155 		    else {
156 			boolean matched = false;
157 			nextTerm++;
158 			while (!matched && (nextTerm != terms.size())) {
159 			    if (terms.get(nextTerm).getForm().equals(tokens[i])) {
160 				mapping.put(i, terms.get(nextTerm));
161 				matched = true;
162 			    }
163 			    nextTerm++;
164 			}
165 			if (!matched) {
166 			    //throw new Exception("Can't perform parentheses=>NAF at constituency (tok_id: " + terms.get(nextTerm).getId()  + ", [" + termForm + "] != [" + tokens[i] + "])");
167 				throw new Exception("Can't perform parentheses=>NAF at constituency: form \"" + tokens[i] + "\" not found in the KAF document.");
168 			}
169 		    }
170 		}
171 	    }
172 	}
173 	return mapping;
174     }
175 
176     private static NonTerminal createNonTerminal(String[] tokens, int start, int end, HashMap<Integer, Integer> parenthesesMap, HashMap<Integer, Term> termMap, KAFDocument kaf) {
177 	String tag = tokens[start];
178 	boolean isHead = isHead(tag);
179 	if (isHead) {
180 	    tag = removeHeadMark(tag);
181 	}
182 	NonTerminal nt = kaf.newNonTerminal(tag);
183 	if (isHead) {
184 	    nt.setHead(true);
185 	}
186 	if (end - start == 1) {
187 	    Terminal t = Tree.createTerminal(tokens[end], termMap.get(end), kaf);
188 	    try {
189 		nt.addChild(t);
190 	    } catch(Exception e) {}
191 	}
192 	else {
193 	    int current = start + 1;
194 	    while (current <= end) {
195 		int subParEnd = parenthesesMap.get(current);
196 		NonTerminal nnt = Tree.createNonTerminal(tokens, current+1, subParEnd-1, parenthesesMap, termMap, kaf);
197 		try {
198 		    nt.addChild(nnt);
199 		} catch(Exception e) {}
200 		current = subParEnd + 1;
201 	    }
202 	}
203 	return nt;
204     }
205 
206     private static Terminal createTerminal(String token, Term term, KAFDocument kaf) {
207 	Span<Term> span = kaf.newTermSpan();
208 	span.addTarget(term);
209 	return kaf.newTerminal(span);
210     }
211 
212     private static void check(String[] tokens) throws Exception {
213 	int opened = 0;
214 	for (int i=0; i<tokens.length; i++) {
215 	    if (tokens[i].equals("(")) {
216 		if ((i>0) && (tokens[i-1].equals("("))) {
217 		    throw Tree.getException(tokens, i);
218 		}
219 		else if (i == tokens.length-1) {
220 		    throw Tree.getException(tokens, i);
221 		}
222 		opened++;
223 	    }
224 	    else if (tokens[i].equals(")")) {
225 		if ((i<3) || tokens[i-1].equals("(")) {
226 		    throw Tree.getException(tokens, i);
227 		}
228 		opened--;
229 	    }
230 	    else { // string token
231 		if ((i==0) || (i == tokens.length-1)) {
232 		    throw Tree.getException(tokens, i);
233 		}
234 		else if (isAWord(tokens[i-1]) && isAWord(tokens[i+1])) {
235 		    throw Tree.getException(tokens, i);
236 		}
237 		else if (tokens[i-1].equals(")")) {
238 		    throw Tree.getException(tokens, i);
239 		}
240 		else if (tokens[i-1].equals("(") && tokens[i+1].equals(")")) {
241 		    throw Tree.getException(tokens, i);
242 		}
243 	    }
244 	}
245 	if (opened != 0) {
246 	    throw Tree.getException(tokens, tokens.length-1);
247 	}
248     }
249 
250     private static boolean isAWord(String token) {
251 	return (!token.equals("(")) && (!token.equals(")"));
252     }
253 
254     private static Exception getException(String[] tokens, int ind) {
255 	String str = new String("Parentheses format not valid: \"... ");
256 	for (int i=(ind<5 ? 0 : ind-5); i<(ind>tokens.length-6 ? tokens.length-1 : ind+5); i++) {
257 	    if (i == ind) {
258 		str += "->";
259 	    }
260 	    str += tokens[i];
261 	    if (i == ind) {
262 		str += "<-";
263 	    }
264 	    str += " ";
265 	}
266 	return new Exception(str + " ...\"");
267     }
268 
269     private static boolean isHead(String tag) {
270 	return tag.endsWith(HEAD_MARK);
271     }
272 
273     private static String removeHeadMark(String tag) {
274 	if (!isHead(tag)) {
275 	    return tag;
276 	}
277 	return tag.substring(0, tag.length() - HEAD_MARK.length());
278     }
279 }