package eu.fbk.dkm.pikes.raid.mdfsa.parser;

import edu.stanford.nlp.trees.Tree;
import eu.fbk.dkm.pikes.raid.mdfsa.FileManager;
import eu.fbk.dkm.pikes.raid.mdfsa.FileManager.Mode;
import eu.fbk.dkm.pikes.raid.mdfsa.MaxEntTagger;
import eu.fbk.dkm.pikes.raid.mdfsa.wordnet.WordNetLexicalizer;
import eu.fbk.dkm.pikes.raid.mdfsa.wordnet.WordNetLoader;
import eu.fbk.shell.mdfsa.data.structures.SentenceStructuredRepresentation;

import java.util.ArrayList;
import java.util.Properties;

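/**
 * Parses dataset instances for the MDFSA pipeline: it normalizes the raw text of a sentence,
 * PoS-tags it, builds its dependency and constituency trees, and wraps everything into a
 * {@link SentenceStructuredRepresentation}, optionally enriched with WordNet-based
 * lexicalization, semantic concepts, and aspects.
 *
 * <p>A minimal usage sketch, assuming {@code prp} and {@code wnl} have been initialized elsewhere:
 * <pre>{@code
 * DatasetInstanceParser parser = new DatasetInstanceParser(prp, wnl);
 * SentenceStructuredRepresentation ssr =
 *     parser.createSentenceStructuredRepresentation("The battery life is great.", "sentence_1");
 * }</pre>
 */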
public class DatasetInstanceParser {

    private Properties prp;
    private WordNetLoader wnl;
    private WordNetLexicalizer wnlex;
    private DependenciesBuilder db;
    private MaxEntTagger met;

    public DatasetInstanceParser(Properties prp, WordNetLoader wnl) {
        this.prp = prp;
        this.wnl = wnl;
        // The lexicalizer requires WordNet data, so it is instantiated only when a loader is provided.
        if (this.wnl != null) {
            this.wnlex = new WordNetLexicalizer(this.wnl.getAllTerms(), this.wnl.getAllExceptions());
        }
        this.db = new DependenciesBuilder();
        this.met = new MaxEntTagger(this.prp);
        this.db.init();
    }

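    /**
     * Builds the structured representation of a single sentence: the text is normalized,
     * PoS-tagged, and parsed into dependency and constituency trees; when WordNet data is
     * available, the representation is further enriched with lexicalized terms, semantic
     * concepts, and aspects.
     *
     * @param originalText the raw text of the sentence
     * @param id           the URI/identifier assigned to the sentence
     * @return the populated {@link SentenceStructuredRepresentation}
     */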
    public SentenceStructuredRepresentation createSentenceStructuredRepresentation(String originalText, String id) {

        // Pad punctuation marks with spaces so that each becomes a separate token,
        // then collapse runs of multiple spaces into a single one.
        String text = originalText.replaceAll("\\.", " \\. ");
        text = text.replaceAll("\\:", " \\: ");
        text = text.replaceAll("\\,", " \\, ");
        text = text.replaceAll("\\!", " \\! ");
        text = text.replaceAll("\\?", " \\? ");
        text = text.replaceAll("( )+", " ");

        // Strip tab characters from the normalized text.
        text = text.replaceAll("\t", "");

        String taggedReview = null;
        ArrayList<DependencyTree> curDt = null;
        ArrayList<Tree> parsedTree = null;

        // PoS-tag the normalized text with the maximum entropy tagger.
        taggedReview = this.met.tag(text);
        taggedReview = taggedReview.replaceAll("/\\.", "/\\. ");

        // Build the dependency trees and the constituency parse trees on the lowercased text.
        this.db.buildDependeciesTree(text.toLowerCase());
        curDt = this.db.getDependencyTrees();
        parsedTree = this.db.getParsedTrees();

        // Assemble the structured representation of the sentence.
        SentenceStructuredRepresentation ssr = new SentenceStructuredRepresentation(this.prp);
        ssr.setUri(id);
        ssr.setOriginalText(originalText);
        ssr.setPosTaggedString(taggedReview);
        ssr.setDependencyTree(curDt);
        ssr.setParsedTree(parsedTree);

        // Enrich the representation with WordNet-based information, when a lexicalizer is available.
        if (this.wnlex != null) {
            ssr.createLexicalizedRepresentation(this.wnlex);
            ssr.extractSemanticConcepts(this.wnl, this.wnlex);
            ssr.extractAspects(this.wnl);
        }

        return ssr;
    }

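    /**
     * Converts the plain-text reviews contained in {@code filename} (one review per line) into the
     * RDF/XML format used by task 3 of the ESWC-2014 challenge. Note that the output path is
     * currently hard-coded to a local directory.
     *
     * @param filename    path of the file containing the reviews to convert
     * @param datasetName name used to build the output file name
     */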
    public void convertReviewToESWCChallenge(String filename, String datasetName) {
        FileManager fm = new FileManager(filename, Mode.READ);
        ArrayList<String> contents = fm.importSimpleTextContent();

        // NOTE: the output location is hard-coded.
        FileManager rdfOut = new FileManager("/home/drago/Documents/java_projects/research/nlp/multi_domain_fuzzy_sentiment_analysis/eswc2014_challenge_mdfsa_dragoni/task3/" + datasetName + ".validation.rdf.xml", Mode.WRITE);

        rdfOut.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
        rdfOut.write("<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">");
        int reviewId = 1;

        for (String currentReview : contents) {

            // Escape ampersands so that the review text is valid XML content.
            currentReview = currentReview.replace("&", "&amp;");

            rdfOut.write("\t<rdf:Description rdf:about=\"http://sentic.net/challenge/sentence_" + reviewId + "\">");
            rdfOut.write("\t\t\t<text xmlns=\"http://sentic.net/challenge/\" rdf:datatype=\"http://www.w3.org/TR/rdf-text/\">");
            rdfOut.write("\t\t\t" + currentReview);
            rdfOut.write("\t\t\t</text>");
            rdfOut.write("\t</rdf:Description>");

            reviewId++;
        }
        rdfOut.write("</rdf:RDF>");
        rdfOut.close();
        fm.close();
    }

}