1 package eu.fbk.dkm.pikes.resources;
2
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import eu.fbk.rdfpro.util.Environment;
import eu.fbk.rdfpro.util.IO;
import eu.fbk.utils.core.CommandLine;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.Locale;
import java.util.Map;
19
20 public final class SubjectivityLexicon extends Lexicon<SubjectivityLexicon.Lexeme> {
21
22 private static final Logger LOGGER = LoggerFactory.getLogger(SubjectivityLexicon.class);
23
24 private static SubjectivityLexicon instance = null;
25
26 public static synchronized void setInstance(@Nullable final SubjectivityLexicon instance) {
27 SubjectivityLexicon.instance = instance;
28 }
29
30 public static synchronized SubjectivityLexicon getInstance() {
31 if (instance == null) {
32 final String location = Objects.firstNonNull(
33 Environment.getProperty("subjectivity.lexicon.home"),
34 "SubjectivityLexicon.tsv");
35 try {
36 instance = Lexicon.readFrom(SubjectivityLexicon.class, Lexeme.class, location);
37 } catch (final Throwable ex) {
38 throw new Error("Could not read default subjectivity lexicon at " + location, ex);
39 }
40 }
41 return instance;
42 }
43
44 public static SubjectivityLexicon index(final String resourceFile) throws IOException {
45
46 final Map<String, Lexeme> lexemes = Maps.newHashMap();
47 try (BufferedReader reader = new BufferedReader(IO.utf8Reader(IO.buffer(IO
48 .read(resourceFile))))) {
49
50 String line;
51 while ((line = reader.readLine()) != null) {
52
53 String word = null;
54 String pos = null;
55 Polarity polarity = null;
56 boolean stemmed = false;
57 boolean strong = false;
58
59 for (final String token : line.split("\\s+")) {
60 final int index = token.indexOf('=');
61 if (index < 0) {
62 LOGGER.warn("Could not parse token '" + token + "'");
63 continue;
64 }
65 final String key = token.substring(0, index).trim();
66 final String value = token.substring(index + 1).trim();
67 if (key.equals("type")) {
68 strong = value.toLowerCase().contains("strong");
69 } else if (key.equals("word1")) {
70 word = value;
71 } else if (key.equals("pos1")) {
72 final String posValue = value.toLowerCase();
73 if (posValue.equals("adj")) {
74 pos = "G";
75 } else if (posValue.equals("adverb")) {
76 pos = "A";
77 } else if (posValue.equals("noun")) {
78 pos = "N";
79 } else if (posValue.equals("verb")) {
80 pos = "V";
81 } else {
82 pos = null;
83 }
84 } else if (key.equals("stemmed1")) {
85 stemmed = value.equalsIgnoreCase("y");
86 } else if (key.equals("priorpolarity")) {
87
88 polarity = value.equalsIgnoreCase("weakneg") ? Polarity.NEGATIVE
89 : Polarity.valueOf(value.toUpperCase());
90 }
91 }
92
93 if (word == null || polarity == null) {
94 LOGGER.warn("Could not parse line (ignoring it):\n" + line);
95 } else {
96 final String lemma = stemmed ? null : word;
97 final String stem = stemmed ? Stemming.stem(null, word) : null;
98 final Token token = Token.create(lemma, stem, pos);
99 final String id = word + (stemmed ? "_stemmed" : "")
100 + (pos == null ? "" : "_" + pos.toLowerCase());
101 final Lexeme lexeme = new Lexeme(id, ImmutableList.of(token), polarity, strong);
102 final Lexeme oldLexeme = lexemes.put(id, lexeme);
103 if (oldLexeme != null) {
104 if (lexeme.getTokens().equals(oldLexeme.getTokens())
105 && lexeme.getPolarity().equals(oldLexeme.getPolarity())
106 && lexeme.isStrong() == oldLexeme.isStrong()) {
107 LOGGER.debug("Ignoring duplicate lexeme:\n " + oldLexeme);
108 } else {
109 LOGGER.warn("Found conflicting lexemes (first one selected):\n (1) "
110 + lexeme + "\n (2) " + oldLexeme);
111 }
112 }
113 }
114 }
115 }
116
117 return new SubjectivityLexicon(lexemes.values());
118 }
119
120 public static void main(final String... args) {
121 try {
122 final CommandLine cmd = CommandLine
123 .parser()
124 .withName("index-subjectivity-lexicon")
125 .withHeader("Processes the original file of the subjectivity lexicon, "
126 + "producing a TSV file with an indexed version of it that can "
127 + "be used with the eu.fbk.dkm.pikes.resources.SubjectivityLexicon Java API class.")
128 .withOption("i", "input", "the input file name", "FILE", CommandLine.Type.FILE_EXISTING,
129 true, false, true)
130 .withOption("o", "output", "the output file name", "FILE", CommandLine.Type.FILE, true,
131 false, true)
132 .withLogger(LoggerFactory.getLogger("eu.fbk"))
133 .parse(args);
134
135 final File inputFile = cmd.getOptionValue("i", File.class);
136 final File outputFile = cmd.getOptionValue("o", File.class);
137
138 final SubjectivityLexicon lexicon = index(inputFile.getAbsolutePath());
139 lexicon.writeTo(outputFile.getAbsolutePath());
140
141 } catch (final Throwable ex) {
142 CommandLine.fail(ex);
143 }
144 }
145
146 public SubjectivityLexicon(final Iterable<Lexeme> lexemes) {
147 super(lexemes);
148 }
149
150 public static final class Lexeme extends Lexicon.Lexeme {
151
152 private final Polarity polarity;
153
154 private final boolean strong;
155
156 public Lexeme(final String id, final Iterable<Token> tokens, final Polarity polarity,
157 final boolean strong) {
158 super(id, tokens);
159 this.polarity = Preconditions.checkNotNull(polarity);
160 this.strong = strong;
161 }
162
163 protected Lexeme(final String id, final Iterable<Token> tokens,
164 final Map<String, String> properties) {
165
166 this(id, tokens, Polarity.valueOf(properties.get("polarity").toUpperCase()), Boolean
167 .valueOf(properties.get("strong").toLowerCase()));
168 }
169
170 @Override
171 protected Map<String, String> getProperties() {
172 return ImmutableMap.of("polarity", this.polarity.toString(), "strong",
173 Boolean.toString(this.strong));
174 }
175
176 public Polarity getPolarity() {
177 return this.polarity;
178 }
179
180 public boolean isStrong() {
181 return this.strong;
182 }
183
184 }
185
186 public enum Polarity {
187
188 NEUTRAL,
189
190 POSITIVE,
191
192 NEGATIVE,
193
194 BOTH
195
196 }
197
198 }