1   package eu.fbk.dkm.pikes.raid.mdfsa.parser;
2   
3   import com.hp.hpl.jena.rdf.model.*;
4   import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
5   import edu.stanford.nlp.trees.Tree;
6   import eu.fbk.dkm.pikes.raid.mdfsa.FileManager;
7   import eu.fbk.dkm.pikes.raid.mdfsa.FileManager.Mode;
8   import eu.fbk.dkm.pikes.raid.mdfsa.MaxEntTagger;
9   import eu.fbk.shell.mdfsa.data.structures.SentenceStructuredRepresentation;
10  import eu.fbk.dkm.pikes.raid.mdfsa.wordnet.WordNetLexicalizer;
11  import eu.fbk.dkm.pikes.raid.mdfsa.wordnet.WordNetLoader;
12  import org.w3c.dom.Document;
13  
14  import java.util.ArrayList;
15  import java.util.Properties;
16  
17  public class ReviewsParser {
18  
19    private Properties prp;
20    private WordNetLoader wnl;
21    
22    public ReviewsParser(Properties prp, WordNetLoader wnl) {
23      this.prp = prp;
24      this.wnl = wnl;
25    }
26    
27    
28    /**
29     * Loads the current dataset name during the full simulation execution
30     * @param datasetName dataset name
31     */
32    public Document[] loadFull(String datasetName) {
33      String positiveReviews = ((String) this.prp.getProperty("mdfsa.dataset.basepath")).concat(datasetName + "/positive.review");
34      String negativeReviews = ((String) this.prp.getProperty("mdfsa.dataset.basepath")).concat(datasetName + "/negative.review");
35      String allReviews = ((String) this.prp.getProperty("mdfsa.dataset.basepath")).concat(datasetName + "/all.review");
36      return null;
37    }
38    
39    
40     
41    /**
42     * Loads a review-format file during a simple execution
43     */
44    public ArrayList<SentenceStructuredRepresentation> load(String filename) {
45      
46      WordNetLexicalizer wnlex = new WordNetLexicalizer(this.wnl.getAllTerms(), this.wnl.getAllExceptions());
47      ArrayList<SentenceStructuredRepresentation> ssrList = new ArrayList<SentenceStructuredRepresentation>();
48      LexicalizedParser treeParser;
49      DependenciesBuilder db = new DependenciesBuilder();
50      MaxEntTagger met = new MaxEntTagger(this.prp);
51      db.init();
52      FileManager fm = new FileManager(filename, Mode.READ);
53      Model content = fm.importRDFContent();
54      
55      String task = prp.getProperty("mdfsa.task");
56      
57      int reviewId = 1;
58      // Lists the statements in the Model
59      StmtIterator iter = content.listStatements();
60  
61      System.out.println(content.size());
62      int stmtID = 0;
63      // Prints out the predicate, subject, and object of each statement
64      while (iter.hasNext()) {
65        System.out.println("Loading sentence " + reviewId);
66        Statement stmt      = iter.nextStatement();  // get next statement
67        Resource  subject   = stmt.getSubject();     // get the subject
68        Property  predicate = stmt.getPredicate();   // get the predicate
69        RDFNode   object    = stmt.getObject();      // get the object
70        
71        /* Gets the review text */
72        int endText = object.toString().indexOf("^^");
73        String currentReviewOriginal = object.toString().substring(1, endText).replaceAll("\n", "");
74        String currentReview = currentReviewOriginal.replaceAll("\\.", " \\. ");
75        currentReview = currentReview.replaceAll("\\:", " \\: ");
76        currentReview = currentReview.replaceAll("\\,", " \\, ");
77        currentReview = currentReview.replaceAll("\\!", " \\! ");
78        currentReview = currentReview.replaceAll("\\?", " \\? ");
79        currentReview = currentReview.replaceAll("( )+", " ");
80        /*
81        currentReview = currentReview.replace(".", " . ");
82        currentReview = currentReview.replace("\"", " ");
83        currentReview = currentReview.replace("!", " ! ");
84        currentReview = currentReview.replace("?", " ? ");
85        currentReview = currentReview.replace(":", " : ");
86        currentReview = currentReview.replace(";", " ; ");
87        currentReview = currentReview.replace(",", " , ");
88        currentReview = currentReview.replace("(", " ");
89        currentReview = currentReview.replace(")", " ");
90        currentReview = currentReview.replace("[", " ");
91        currentReview = currentReview.replace("]", " ");
92        currentReview = currentReview.replace("\\", " ");
93        currentReview = currentReview.replace("$", " ");
94        currentReview = currentReview.replace("%", " ");
95        currentReview = currentReview.replace("=", " ");
96        currentReview = currentReview.replace("_", " ");
97        currentReview = currentReview.replace("+", " ");
98        currentReview = currentReview.replace("&", " ");
99        currentReview = currentReview.replace("^", " ");
100       currentReview = currentReview.replace("|", " ");
101       currentReview = currentReview.replace("@", " ");
102       currentReview = currentReview.replace("`", " ");
103       currentReview = currentReview.trim();
104       */
105       currentReview = currentReview.replaceAll("\t", "");
106       //System.out.println(currentReview);
107       //currentReview = "I gave up to go to supermarkets yesterday.";
108       
109       /* Tags the review with the part-of-speech tags */
110       String taggedReview = null;
111                 
112       /* Builds the dependenct tree of the text */
113       ArrayList<DependencyTree> curDt = null;
114   
115       /* Extracts the parser tree of the sentece */
116       ArrayList<Tree> parsedTree = null;
117       
118       //if(task.compareTo("AdvancedOne") == 0 || task.compareTo("AdvancedTwo") == 0) {
119       
120         /* Tags the review with the part-of-speech tags */
121         taggedReview = met.tag(currentReview);
122         taggedReview = taggedReview.replaceAll("/\\.", "/\\. ");
123           
124         /* Builds the dependenct tree of the text */
125         db.buildDependeciesTree(currentReview.toLowerCase());
126     
127         /* Extracts the parser tree of the sentece */
128         curDt = db.getDependencyTrees();
129         parsedTree = db.getParsedTrees();
130         
131       /*} else {
132         
133         /* Tags the review with the part-of-speech tags */
134         //taggedReview = null;
135                   
136         /* Builds the dependenct tree of the text */
137         //curDt = null;
138     
139         /* Extracts the parser tree of the sentece */
140         //parsedTree = null;
141       //}
142       
143       /* Creates and sets the sentence object */
144       SentenceStructuredRepresentation ssr = new SentenceStructuredRepresentation(this.prp);
145       ssr.setUri(subject.toString());
146       ssr.setOriginalText(currentReviewOriginal);
147       ssr.setPosTaggedString(taggedReview);
148       ssr.setDependencyTree(curDt);
149       ssr.setParsedTree(parsedTree);
150       //ssr.extractTree(parsedTree);
151       //System.out.println(parsedTree);
152       //System.exit(0);
153       
154       //if(task.compareTo("AdvancedOne") == 0 || task.compareTo("AdvancedTwo") == 0) {
155         ssr.createLexicalizedRepresentation(wnlex);
156         ssr.extractSemanticConcepts(this.wnl, wnlex);
157         ssr.extractAspects(this.wnl);
158       //}
159       
160       ssrList.add(ssr);
161       
162       /* Gets next text to analyze */
163       //startText = content.indexOf("<review_text", endText + 10);
164       //endText = content.indexOf("</review_text", startText + 10);
165       //System.out.println(reviewId + " - " + startText + " - " + endText);
166       reviewId++;
167     }
168     return ssrList;
169   }
170   
171   
172   
173   
174   /**
175    * Utility method that convert the blitzer review in the eswc2014 challenge format
176    */
177   public void convertReviewToESWCChallenge(String filename, String datasetName) {
178     FileManager fm = new FileManager(filename, Mode.READ);
179     //String content = fm.importFullTextContent();
180     ArrayList<String> contents = fm.importSimpleTextContent();
181     
182     FileManager rdfOut = new FileManager("/home/drago/Documents/java_projects/research/nlp/multi_domain_fuzzy_sentiment_analysis/eswc2014_challenge_mdfsa_dragoni/task3/" + datasetName + ".validation.rdf.xml", Mode.WRITE);
183     
184     rdfOut.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
185     rdfOut.write("<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">");
186     int reviewId = 1;
187     //int startText = content.indexOf("<review_text");
188     //int endText = content.indexOf("</review_text", startText + 10);
189     //while (startText != -1) {
190     for(String currentReview: contents) {
191       /* Gets the review text */
192       //String currentReview = content.substring(startText + 14, endText - 1).replaceAll("\n", "");
193       currentReview = currentReview.replace("&", "&amp;");
194       //currentReview = "I gave up to go to supermarkets yesterday.";
195             
196       /* Write the review in the RDF format */
197       rdfOut.write("\t<rdf:Description rdf:about=\"http://sentic.net/challenge/sentence_" + reviewId + "\">");
198       //rdfOut.write("\t\t<sentence xmlns=\"http://sentic.net/challenge/\" rdf:resource=\"http://sentic.net/challenge/sentence_" + reviewId + "\">");
199       rdfOut.write("\t\t\t<text xmlns=\"http://sentic.net/challenge/\" rdf:datatype=\"http://www.w3.org/TR/rdf-text/\">");
200       //rdfOut.write("\t\t\t<![CDATA[" + currentReview + "]]>");
201       rdfOut.write("\t\t\t" + currentReview + "");
202       rdfOut.write("\t\t\t</text>");
203       //rdfOut.write("\t\t</sentence>");
204       rdfOut.write("\t</rdf:Description>");
205       
206       
207       /* Gets next text to analyze */
208       //startText = content.indexOf("<review_text", endText + 10);
209       //endText = content.indexOf("</review_text", startText + 10);
210       //System.out.println(reviewId + " - " + startText + " - " + endText);
211       reviewId++;
212     }
213     rdfOut.write("</rdf:RDF>");
214     rdfOut.close();
215     fm.close();
216   }
217   
218   
219 }