package eu.fbk.shell.mdfsa.data.structures;

import edu.stanford.nlp.trees.Tree;
import eu.fbk.dkm.pikes.raid.mdfsa.parser.DependencyTree;
import eu.fbk.dkm.pikes.raid.mdfsa.wordnet.WordNetLexicalizer;
import eu.fbk.dkm.pikes.raid.mdfsa.wordnet.WordNetLoader;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Properties;

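/**
 * Structured representation of a single sentence: its raw text, POS-tagged and
 * lexicalized forms, constituency parse trees, dependency trees, and the aspects
 * and semantic concepts extracted from them. Serializable so that instances can
 * be cached between pipeline runs.
 *
 * <p>A minimal usage sketch, assuming {@code props}, {@code wnl}, and {@code wnlex}
 * are an initialized {@code Properties}, {@code WordNetLoader}, and
 * {@code WordNetLexicalizer}; the tagged string shown is an illustrative
 * placeholder, not real tagger output:</p>
 * <pre>{@code
 * SentenceStructuredRepresentation ssr = new SentenceStructuredRepresentation(props);
 * ssr.setOriginalText("The battery life is great");
 * ssr.setPosTaggedString("The/DT battery/NN life/NN is/VBZ great/JJ");
 * ssr.createLexicalizedRepresentation(wnlex); // needs the POS-tagged string
 * ssr.extractSemanticConcepts(wnl, wnlex);    // needs the lexicalized string
 * }</pre>
 */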
public class SentenceStructuredRepresentation implements Serializable {

  private static final long serialVersionUID = 1L;

  private Properties prp;
  private String uri;
  private String originalText;
  private String posTaggedString;                     // space-separated "term/TAG" tokens
  private String lexString;                           // lexicalized (lemmatized) form of posTaggedString
  private String stemmedString;
  private ArrayList<Tree> parsedTree;                 // constituency parse trees
  private HashMap<String, ArrayList<String>> aspects; // aspect -> co-occurring clause words
  private ArrayList<String> semanticConcepts;
  private ArrayList<DependencyTree> dts;              // dependency trees
  private int sentenceMarker;                         // clause counter used during tree traversal

  public SentenceStructuredRepresentation(Properties prp) {
    this.prp = prp;
  }

33
34 public void setUri(String uri) {
35 this.uri = uri;
36 }
37
38 public String getUri() {
39 return this.uri;
40 }
41
42 public String getOriginalText() {
43 return this.originalText;
44 }
45
46 public void setOriginalText(String originalText) {
47 this.originalText = originalText;
48 }
49
50 public String getPosTaggedString() {
51 return this.posTaggedString;
52 }
53
54 public void setPosTaggedString(String posTaggedString) {
55 this.posTaggedString = posTaggedString;
56 }
57
58 public ArrayList<DependencyTree> getDependencyTree() {
59 return this.dts;
60 }
61
62 public void setDependencyTree(ArrayList<DependencyTree> dts) {
63 this.dts = dts;
64 }
65
66 public HashMap<String, ArrayList<String>> getAspects() {
67 return this.aspects;
68 }
69
70 public ArrayList<String> getSemanticConcepts() {
71 return this.semanticConcepts;
72 }
73
74 public ArrayList<Tree> getParsedTree() {
75 return this.parsedTree;
76 }
77
78 public void setParsedTree(ArrayList<Tree> parsedTree) {
79 this.parsedTree = parsedTree;
80 }
81
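  /**
   * Builds the lexicalized form of the POS-tagged string by replacing inflected
   * nouns, verbs, adjectives, and adverbs with their WordNet base forms; the
   * result is stored in {@code lexString} as space-separated "term/TAG" tokens.
   */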
  public void createLexicalizedRepresentation(WordNetLexicalizer wnlex) {
    String[] posTaggedTerms = this.posTaggedString.split(" ");
    StringBuilder tempLex = new StringBuilder();
    for(String curTaggedTerm: posTaggedTerms) {
      if(curTaggedTerm.isEmpty()) {
        continue;
      }
      try {
        String surfaceForm = curTaggedTerm.substring(0, curTaggedTerm.indexOf("/"));
        String tag = curTaggedTerm.substring(curTaggedTerm.indexOf("/") + 1);
        String term = surfaceForm;

        // Map inflected forms to their WordNet base form, keyed by coarse POS class.
        if(tag.equals("NNS") || tag.equals("NNPS")) {
          term = wnlex.getWordLexicalizationByType(term, "N");
        } else if(tag.equals("VBD") || tag.equals("VBG") || tag.equals("VBN") ||
                  tag.equals("VBP") || tag.equals("VBZ")) {
          term = wnlex.getWordLexicalizationByType(term, "V");
        } else if(tag.equals("JJR") || tag.equals("JJS")) {
          term = wnlex.getWordLexicalizationByType(term, "AJ");
        } else if(tag.equals("RBR") || tag.equals("RBS")) {
          term = wnlex.getWordLexicalizationByType(term, "AV");
        }

        // Fall back to the surface form when WordNet provides no lexicalization.
        if(term == null) {
          term = surfaceForm;
        }
        tempLex.append(term).append("/").append(tag).append(" ");
      } catch(Exception e) {
        // Malformed tokens (e.g., missing "/" separator) are reported and skipped.
        e.printStackTrace();
      }
    }
    this.lexString = tempLex.toString().trim();
  }

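  /**
   * Placeholder for building the stemmed representation; not implemented here.
   */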
  public void createStemmedRepresentation() {
    // Intentionally empty: stemming is not implemented in this version.
  }

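  /**
   * Populates {@code semanticConcepts} with candidate concepts: compound nouns
   * found in the lexicalized string, plus verb-object pairs taken from the
   * "dobj" relations of the dependency trees.
   */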
  public void extractSemanticConcepts(WordNetLoader wnl, WordNetLexicalizer wnlex) {
    this.semanticConcepts = new ArrayList<String>();

    // Pass 1: collect nouns (and foreign words) from the lexicalized string,
    // merging consecutive nouns into a single compound concept.
    String[] termsList = this.lexString.split(" ");
    boolean compoundNounFlag = false;
    for(String currentTerm : termsList) {
      String[] atom = currentTerm.split("/");
      if(atom.length > 1) {
        Integer stopFlag = wnl.getStopwords().get(atom[0]);
        if(stopFlag != null) continue;
        if(atom[1].equals("NN") || atom[1].equals("NNP") || atom[1].equals("NNPS") ||
           atom[1].equals("NNS") || atom[1].equals("FW")) {
          String newAspect;
          if(compoundNounFlag) {
            // Extend the previous concept with the current noun
            // (e.g., "battery" + "life" -> "battery_life").
            newAspect = this.semanticConcepts.get(this.semanticConcepts.size() - 1);
            newAspect = (newAspect + " " + atom[0]).replaceAll(" ", "_").toLowerCase();
            this.semanticConcepts.remove(this.semanticConcepts.size() - 1);
          } else {
            newAspect = atom[0].replaceAll(" ", "_").toLowerCase();
          }
          if(!this.semanticConcepts.contains(newAspect)) {
            this.semanticConcepts.add(newAspect);
          }
          compoundNounFlag = true;
        } else {
          compoundNounFlag = false;
        }
      }
    }

    // Pass 2: extract verb-object concepts (e.g., "take_picture") from the
    // "dobj" relations of each dependency tree.
    for(DependencyTree dt: this.dts) {
      ArrayList<String> dependencies = dt.getDependecies();
      for(String curDep: dependencies) {
        String[] tokens = curDep.split("\\^\\^\\^");
        if(tokens.length == 3) {
          if(tokens[0].trim().equals("dobj")) {
            String[] tokenOne = tokens[1].split("-");
            String[] tokenTwo = tokens[2].split("-");

            // Lemmatize the governor as a verb and the dependent as a noun,
            // falling back to the surface form when no lexicalization exists.
            String partOne = wnlex.getWordLexicalizationByType(tokenOne[0], "V");
            if(partOne == null) {
              partOne = tokenOne[0];
            }
            String partTwo = wnlex.getWordLexicalizationByType(tokenTwo[0], "N");
            if(partTwo == null) {
              partTwo = tokenTwo[0];
            }
            String newAspect = partOne + "_" + partTwo;
            if(!this.semanticConcepts.contains(newAspect)) {
              this.semanticConcepts.add(newAspect);
            }
          }
        }
      }
    }
  }

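  /**
   * Populates the {@code aspects} map. Candidate aspects (nouns and compound
   * nouns) are collected from the lexicalized string; each aspect is then mapped
   * to the other words occurring in the same clause of the parse trees.
   */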
  public void extractAspects(WordNetLoader wnl) {
    ArrayList<String> tempAspects = new ArrayList<String>();
    this.aspects = new HashMap<String, ArrayList<String>>();

    // Pass 1: collect candidate aspects from the lexicalized string, merging
    // consecutive nouns into compounds (same strategy as in extractSemanticConcepts).
    String[] termsList = this.lexString.split(" ");
    boolean compoundNounFlag = false;
    for(String currentTerm : termsList) {
      String[] atom = currentTerm.split("/");
      if(atom.length > 1) {
        Integer stopFlag = wnl.getStopwords().get(atom[0]);
        if(stopFlag != null) continue;
        if(atom[1].equals("NN") || atom[1].equals("NNP") || atom[1].equals("NNPS") ||
           atom[1].equals("NNS") || atom[1].equals("FW")) {
          String newAspect;
          if(compoundNounFlag) {
            newAspect = tempAspects.get(tempAspects.size() - 1);
            newAspect = (newAspect + " " + atom[0]).replaceAll(" ", "_").toLowerCase();
            tempAspects.remove(tempAspects.size() - 1);
          } else {
            newAspect = atom[0].replaceAll(" ", "_").toLowerCase();
          }
          if(!tempAspects.contains(newAspect)) {
            tempAspects.add(newAspect);
          }
          compoundNounFlag = true;
        } else {
          compoundNounFlag = false;
        }
      }
    }

    // Pass 2: map each candidate aspect to the other words that occur in the same
    // clause, using the clause segmentation produced by extractRelatedFeatures.
    HashMap<Integer, ArrayList<String>> featureSentence = new HashMap<Integer, ArrayList<String>>();
    for(Tree pt: this.parsedTree) {
      this.sentenceMarker = 0;
      this.extractRelatedFeatures(pt, this.sentenceMarker, featureSentence);
      for(String curAspect: tempAspects) {
        String[] compoundAspect = curAspect.split(" ");
        for(String cA: compoundAspect) {
          Iterator<Integer> it = featureSentence.keySet().iterator();
          while(it.hasNext()) {
            int key = it.next();
            ArrayList<String> currentTree = featureSentence.get(key);
            if(currentTree.contains(cA)) {
              ArrayList<String> featuresList = this.aspects.get(curAspect);
              if(featuresList == null) {
                featuresList = new ArrayList<String>();
              }
              for(String currentFeature: currentTree) {
                if(!currentFeature.equals(cA)) {
                  featuresList.add(currentFeature);
                }
              }
              this.aspects.put(curAspect, featuresList);
            }
          }
        }
      }
    }
  }

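  /**
   * Returns the innermost clause subtrees (nodes labeled "S" or "SBAR") of the
   * given parse tree; an enclosing clause is discarded whenever a nested clause
   * is found inside it.
   */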
  public ArrayList<Tree> extractTree(Tree t) {
    ArrayList<Tree> wanted = new ArrayList<Tree>();
    if (t.label().value().equals("S") || t.label().value().equals("SBAR")) {
      wanted.add(t);
      for (Tree child : t.children()) {
        ArrayList<Tree> temp = this.extractTree(child);
        // If a child contains its own clause(s), drop the enclosing clause so
        // that only the innermost S/SBAR subtrees are kept.
        if (temp.size() > 0) {
          int o = wanted.indexOf(t);
          if (o != -1) {
            wanted.remove(o);
          }
        }
        wanted.addAll(temp);
      }
    } else {
      for (Tree child : t.children()) {
        wanted.addAll(this.extractTree(child));
      }
    }
    return wanted;
  }

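  /**
   * Walks a parse tree and assigns every leaf word to the clause it belongs to:
   * {@code featureSentence} maps each clause marker to the words it contains.
   */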
  private void extractRelatedFeatures(Tree t, int marker, HashMap<Integer, ArrayList<String>> featureSentence) {
    // Entering a clause ("S" or "SBAR") opens a new group: sentenceMarker is a
    // class-level counter, so every clause in the tree receives a distinct marker.
    int localmarker = marker;
    if (t.label().value().equals("S") || t.label().value().equals("SBAR")) {
      localmarker = this.sentenceMarker + 1;
      this.sentenceMarker = localmarker;
    }

    // Leaf words are detected heuristically: multi-character labels that are fully
    // lowercase (phrase and POS labels such as "NP" or "VBZ" contain uppercase).
    if (t.label().value().length() > 1 && t.label().value().equals(t.label().value().toLowerCase())) {
      ArrayList<String> currentFeatures = featureSentence.get(localmarker);
      if(currentFeatures == null) {
        currentFeatures = new ArrayList<String>();
      }
      currentFeatures.add(t.label().value());
      featureSentence.put(localmarker, currentFeatures);
    }

    for (Tree child: t.children()) {
      this.extractRelatedFeatures(child, localmarker, featureSentence);
    }
  }

}