1   package eu.fbk.dkm.pikes.raid.mdfsa.parser;
2   
3   import edu.stanford.nlp.trees.Tree;
4   import eu.fbk.dkm.pikes.raid.mdfsa.FileManager;
5   import eu.fbk.dkm.pikes.raid.mdfsa.FileManager.Mode;
6   import eu.fbk.dkm.pikes.raid.mdfsa.MaxEntTagger;
7   import eu.fbk.shell.mdfsa.data.structures.SentenceStructuredRepresentation;
8   import eu.fbk.dkm.pikes.raid.mdfsa.wordnet.WordNetLexicalizer;
9   import eu.fbk.dkm.pikes.raid.mdfsa.wordnet.WordNetLoader;
10  
11  import java.util.ArrayList;
12  import java.util.Properties;
13  
14  public class DatasetInstanceParser {
15  
16    private Properties prp;
17    private WordNetLoader wnl;
18    private WordNetLexicalizer wnlex;
19    private DependenciesBuilder db;
20    private MaxEntTagger met;
21    
22    public DatasetInstanceParser(Properties prp, WordNetLoader wnl) {
23      this.prp = prp;
24      this.wnl = wnl;
25      if(this.wnl != null) {
26        this.wnlex = new WordNetLexicalizer(wnl.getAllTerms(), this.wnl.getAllExceptions());
27     }
28     this.db = new DependenciesBuilder();
29     this.met = new MaxEntTagger(this.prp);
30     this.db.init();
31    }
32    
33    
34      
35    /**
36     * Loads a review-format file during a simple execution
37     */
38    public SentenceStructuredRepresentation createSentenceStructuredRepresentation(String originalText, String id) {
39        
40      //System.out.println("Loading sentence " + originalText);
41            
42      /* Gets the review text */
43      String text = originalText.replaceAll("\\.", " \\. ");
44      text = text.replaceAll("\\:", " \\: ");
45      text = text.replaceAll("\\,", " \\, ");
46      text = text.replaceAll("\\!", " \\! ");
47      text = text.replaceAll("\\?", " \\? ");
48      text = text.replaceAll("( )+", " ");
49      
50      /*
51      currentReview = currentReview.replace(".", " . ");
52      currentReview = currentReview.replace("\"", " ");
53      currentReview = currentReview.replace("!", " ! ");
54      currentReview = currentReview.replace("?", " ? ");
55      currentReview = currentReview.replace(":", " : ");
56      currentReview = currentReview.replace(";", " ; ");
57      currentReview = currentReview.replace(",", " , ");
58      currentReview = currentReview.replace("(", " ");
59      currentReview = currentReview.replace(")", " ");
60      currentReview = currentReview.replace("[", " ");
61      currentReview = currentReview.replace("]", " ");
62      currentReview = currentReview.replace("\\", " ");
63      currentReview = currentReview.replace("$", " ");
64      currentReview = currentReview.replace("%", " ");
65      currentReview = currentReview.replace("=", " ");
66      currentReview = currentReview.replace("_", " ");
67      currentReview = currentReview.replace("+", " ");
68      currentReview = currentReview.replace("&", " ");
69      currentReview = currentReview.replace("^", " ");
70      currentReview = currentReview.replace("|", " ");
71      currentReview = currentReview.replace("@", " ");
72      currentReview = currentReview.replace("`", " ");
73      currentReview = currentReview.trim();
74      */
75      
76      text = text.replaceAll("\t", "");
77  
78      
79      /* Tags the review with the part-of-speech tags */
80      String taggedReview = null;
81                
82      /* Builds the dependent tree of the text */
83      ArrayList<DependencyTree> curDt = null;
84  
85      /* Extracts the parser tree of the sentence */
86      ArrayList<Tree> parsedTree = null;
87        
88      /* Tags the review with the part-of-speech tags */
89      taggedReview = this.met.tag(text);
90      taggedReview = taggedReview.replaceAll("/\\.", "/\\. ");
91          
92      /* Builds the dependent tree of the text */
93      this.db.buildDependeciesTree(text.toLowerCase());
94      curDt = this.db.getDependencyTrees();
95    
96      /* Extracts the parser tree of the sentence */
97      parsedTree = this.db.getParsedTrees();
98       
99      
100     /* Creates and sets the sentence object */
101     SentenceStructuredRepresentation ssr = new SentenceStructuredRepresentation(this.prp);
102     ssr.setUri(id);
103     ssr.setOriginalText(originalText);
104     ssr.setPosTaggedString(taggedReview);
105     ssr.setDependencyTree(curDt);
106     ssr.setParsedTree(parsedTree);
107     
108     
109     if(this.wnlex != null) {
110       ssr.createLexicalizedRepresentation(this.wnlex);
111       ssr.extractSemanticConcepts(this.wnl, this.wnlex);
112       ssr.extractAspects(this.wnl);
113     }
114     
115    
116     return ssr;
117   }
118   
119   
120   
121   
122   /**
123    * Utility method that convert the blitzer review in the eswc2014 challenge format
124    */
125   public void convertReviewToESWCChallenge(String filename, String datasetName) {
126     FileManager fm = new FileManager(filename, Mode.READ);
127     //String content = fm.importFullTextContent();
128     ArrayList<String> contents = fm.importSimpleTextContent();
129     
130     FileManager rdfOut = new FileManager("/home/drago/Documents/java_projects/research/nlp/multi_domain_fuzzy_sentiment_analysis/eswc2014_challenge_mdfsa_dragoni/task3/" + datasetName + ".validation.rdf.xml", Mode.WRITE);
131     
132     rdfOut.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
133     rdfOut.write("<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">");
134     int reviewId = 1;
135     //int startText = content.indexOf("<review_text");
136     //int endText = content.indexOf("</review_text", startText + 10);
137     //while (startText != -1) {
138     for(String currentReview: contents) {
139       /* Gets the review text */
140       //String currentReview = content.substring(startText + 14, endText - 1).replaceAll("\n", "");
141       currentReview = currentReview.replace("&", "&amp;");
142       //currentReview = "I gave up to go to supermarkets yesterday.";
143             
144       /* Write the review in the RDF format */
145       rdfOut.write("\t<rdf:Description rdf:about=\"http://sentic.net/challenge/sentence_" + reviewId + "\">");
146       //rdfOut.write("\t\t<sentence xmlns=\"http://sentic.net/challenge/\" rdf:resource=\"http://sentic.net/challenge/sentence_" + reviewId + "\">");
147       rdfOut.write("\t\t\t<text xmlns=\"http://sentic.net/challenge/\" rdf:datatype=\"http://www.w3.org/TR/rdf-text/\">");
148       //rdfOut.write("\t\t\t<![CDATA[" + currentReview + "]]>");
149       rdfOut.write("\t\t\t" + currentReview + "");
150       rdfOut.write("\t\t\t</text>");
151       //rdfOut.write("\t\t</sentence>");
152       rdfOut.write("\t</rdf:Description>");
153       
154       
155       /* Gets next text to analyze */
156       //startText = content.indexOf("<review_text", endText + 10);
157       //endText = content.indexOf("</review_text", startText + 10);
158       //System.out.println(reviewId + " - " + startText + " - " + endText);
159       reviewId++;
160     }
161     rdfOut.write("</rdf:RDF>");
162     rdfOut.close();
163     fm.close();
164   }
165   
166   
167 }