1   package eu.fbk.dkm.pikes.resources;
2   
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import eu.fbk.rdfpro.util.Environment;
import eu.fbk.rdfpro.util.IO;
import eu.fbk.utils.core.CommandLine;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.Locale;
import java.util.Map;
19  
20  public final class SubjectivityLexicon extends Lexicon<SubjectivityLexicon.Lexeme> {
21  
22      private static final Logger LOGGER = LoggerFactory.getLogger(SubjectivityLexicon.class);
23  
24      private static SubjectivityLexicon instance = null;
25  
    /**
     * Replaces the shared lexicon instance handed out by {@link #getInstance()}. The method is
     * synchronized on the class, like {@link #getInstance()}.
     *
     * @param instance
     *            the lexicon to share from now on; if null, the next call to
     *            {@link #getInstance()} finds no instance and loads the default lexicon again
     */
    public static synchronized void setInstance(@Nullable final SubjectivityLexicon instance) {
        SubjectivityLexicon.instance = instance;
    }
29  
30      public static synchronized SubjectivityLexicon getInstance() {
31          if (instance == null) {
32              final String location = Objects.firstNonNull(
33                      Environment.getProperty("subjectivity.lexicon.home"),
34                      "SubjectivityLexicon.tsv");
35              try {
36                  instance = Lexicon.readFrom(SubjectivityLexicon.class, Lexeme.class, location);
37              } catch (final Throwable ex) {
38                  throw new Error("Could not read default subjectivity lexicon at " + location, ex);
39              }
40          }
41          return instance;
42      }
43  
44      public static SubjectivityLexicon index(final String resourceFile) throws IOException {
45  
46          final Map<String, Lexeme> lexemes = Maps.newHashMap();
47          try (BufferedReader reader = new BufferedReader(IO.utf8Reader(IO.buffer(IO
48                  .read(resourceFile))))) {
49  
50              String line;
51              while ((line = reader.readLine()) != null) {
52  
53                  String word = null;
54                  String pos = null;
55                  Polarity polarity = null;
56                  boolean stemmed = false;
57                  boolean strong = false;
58  
59                  for (final String token : line.split("\\s+")) {
60                      final int index = token.indexOf('=');
61                      if (index < 0) {
62                          LOGGER.warn("Could not parse token '" + token + "'");
63                          continue;
64                      }
65                      final String key = token.substring(0, index).trim();
66                      final String value = token.substring(index + 1).trim();
67                      if (key.equals("type")) {
68                          strong = value.toLowerCase().contains("strong");
69                      } else if (key.equals("word1")) {
70                          word = value;
71                      } else if (key.equals("pos1")) {
72                          final String posValue = value.toLowerCase();
73                          if (posValue.equals("adj")) {
74                              pos = "G";
75                          } else if (posValue.equals("adverb")) {
76                              pos = "A";
77                          } else if (posValue.equals("noun")) {
78                              pos = "N";
79                          } else if (posValue.equals("verb")) {
80                              pos = "V";
81                          } else {
82                              pos = null;
83                          }
84                      } else if (key.equals("stemmed1")) {
85                          stemmed = value.equalsIgnoreCase("y");
86                      } else if (key.equals("priorpolarity")) {
87                          // There is a single value 'weakneg' that we normalize to 'negative'
88                          polarity = value.equalsIgnoreCase("weakneg") ? Polarity.NEGATIVE
89                                  : Polarity.valueOf(value.toUpperCase());
90                      }
91                  }
92  
93                  if (word == null || polarity == null) {
94                      LOGGER.warn("Could not parse line (ignoring it):\n" + line);
95                  } else {
96                      final String lemma = stemmed ? null : word;
97                      final String stem = stemmed ? Stemming.stem(null, word) : null;
98                      final Token token = Token.create(lemma, stem, pos);
99                      final String id = word + (stemmed ? "_stemmed" : "")
100                             + (pos == null ? "" : "_" + pos.toLowerCase());
101                     final Lexeme lexeme = new Lexeme(id, ImmutableList.of(token), polarity, strong);
102                     final Lexeme oldLexeme = lexemes.put(id, lexeme);
103                     if (oldLexeme != null) {
104                         if (lexeme.getTokens().equals(oldLexeme.getTokens())
105                                 && lexeme.getPolarity().equals(oldLexeme.getPolarity())
106                                 && lexeme.isStrong() == oldLexeme.isStrong()) {
107                             LOGGER.debug("Ignoring duplicate lexeme:\n  " + oldLexeme);
108                         } else {
109                             LOGGER.warn("Found conflicting lexemes (first one selected):\n  (1) "
110                                     + lexeme + "\n  (2) " + oldLexeme);
111                         }
112                     }
113                 }
114             }
115         }
116 
117         return new SubjectivityLexicon(lexemes.values());
118     }
119 
120     public static void main(final String... args) {
121         try {
122             final CommandLine cmd = CommandLine
123                     .parser()
124                     .withName("index-subjectivity-lexicon")
125                     .withHeader("Processes the original file of the subjectivity lexicon, " //
126                             + "producing a TSV file with an indexed version of it that can " //
127                             + "be used with the eu.fbk.dkm.pikes.resources.SubjectivityLexicon Java API class.")
128                     .withOption("i", "input", "the input file name", "FILE", CommandLine.Type.FILE_EXISTING,
129                             true, false, true)
130                     .withOption("o", "output", "the output file name", "FILE", CommandLine.Type.FILE, true,
131                             false, true) //
132                     .withLogger(LoggerFactory.getLogger("eu.fbk")) //
133                     .parse(args);
134 
135             final File inputFile = cmd.getOptionValue("i", File.class);
136             final File outputFile = cmd.getOptionValue("o", File.class);
137 
138             final SubjectivityLexicon lexicon = index(inputFile.getAbsolutePath());
139             lexicon.writeTo(outputFile.getAbsolutePath());
140 
141         } catch (final Throwable ex) {
142             CommandLine.fail(ex);
143         }
144     }
145 
    /**
     * Creates a lexicon from the supplied lexemes, delegating to the {@code Lexicon} superclass
     * constructor.
     *
     * @param lexemes
     *            the lexemes the lexicon is made of
     */
    public SubjectivityLexicon(final Iterable<Lexeme> lexemes) {
        super(lexemes);
    }
149 
150     public static final class Lexeme extends Lexicon.Lexeme {
151 
152         private final Polarity polarity;
153 
154         private final boolean strong;
155 
156         public Lexeme(final String id, final Iterable<Token> tokens, final Polarity polarity,
157                 final boolean strong) {
158             super(id, tokens);
159             this.polarity = Preconditions.checkNotNull(polarity);
160             this.strong = strong;
161         }
162 
163         protected Lexeme(final String id, final Iterable<Token> tokens,
164                 final Map<String, String> properties) {
165             // for use with reflection
166             this(id, tokens, Polarity.valueOf(properties.get("polarity").toUpperCase()), Boolean
167                     .valueOf(properties.get("strong").toLowerCase()));
168         }
169 
170         @Override
171         protected Map<String, String> getProperties() {
172             return ImmutableMap.of("polarity", this.polarity.toString(), "strong",
173                     Boolean.toString(this.strong));
174         }
175 
176         public Polarity getPolarity() {
177             return this.polarity;
178         }
179 
180         public boolean isStrong() {
181             return this.strong;
182         }
183 
184     }
185 
    /**
     * The prior polarity values a {@link Lexeme} can carry. The constant names are what
     * {@code Polarity.valueOf} is matched against when a {@code priorpolarity} value or a
     * {@code polarity} property is upper-cased and parsed elsewhere in this class.
     */
    public enum Polarity {

        NEUTRAL,

        POSITIVE,

        NEGATIVE,

        // NOTE(review): presumably marks entries that are both positive and negative - confirm
        BOTH

    }
197 
198 }