1 package eu.fbk.dkm.pikes.resources.wes;
2
3 import com.fasterxml.jackson.databind.ObjectMapper;
4 import com.google.common.base.Charsets;
5 import com.google.common.io.Files;
6 import eu.fbk.utils.core.CommandLine;
7 import org.slf4j.Logger;
8 import org.slf4j.LoggerFactory;
9
10 import java.io.*;
11 import java.net.HttpURLConnection;
12 import java.net.URL;
13 import java.net.URLEncoder;
14 import java.util.ArrayList;
15 import java.util.List;
16 import java.util.Map;
17 import java.util.regex.Matcher;
18 import java.util.regex.Pattern;
19
20
21
22
23
24 public class QueryGoogle {
25
26 private static final Logger LOGGER = LoggerFactory.getLogger(QueryGoogle.class);
27
28
29 private static String DEFAULT_USER_AGENT = "FBK evaluation";
30
31
32
33
34 private static Pattern wesPattern = Pattern.compile("wes2015\\.(d[0-9]+)\\.naf\\.html");
35
36 private static void sendGet(String query, ArrayList<String> listWithLinks, String googleKey, String googleCx,
37 String agent) throws Exception {
38 sendGet(query, listWithLinks, googleKey, googleCx, agent, 0);
39 }
40
41 private static void sendGet(String query, ArrayList<String> listWithLinks, String googleKey, String googleCx,
42 String agent, int start) throws Exception {
43
44 StringBuffer url = new StringBuffer();
45 url.append("https://www.googleapis.com/customsearch/v1?key=");
46 url.append(googleKey);
47 url.append("&cx=").append(googleCx);
48 url.append("&q=").append(URLEncoder.encode(query, "UTF-8"));
49 if (start > 0) {
50 url.append("&start=").append(start);
51 }
52
53 URL obj = new URL(url.toString());
54 HttpURLConnection con = (HttpURLConnection) obj.openConnection();
55
56
57 con.setRequestMethod("GET");
58
59
60 con.setRequestProperty("User-Agent", agent);
61
62 int responseCode = con.getResponseCode();
63 LOGGER.info("Queried Google [{}], response code {}", url, responseCode);
64
65 BufferedReader in = new BufferedReader(
66 new InputStreamReader(con.getInputStream()));
67 String inputLine;
68 StringBuffer response = new StringBuffer();
69
70 while ((inputLine = in.readLine()) != null) {
71 response.append(inputLine);
72 }
73 in.close();
74
75 ObjectMapper mapper = new ObjectMapper();
76 Map<?, ?> root = mapper.readValue(response.toString(), Map.class);
77 ArrayList<?> items = (ArrayList) root.get("items");
78 if (items != null) {
79 for (Object item : items) {
80 String link = (String) ((Map<?, ?>) item).get("link");
81 listWithLinks.add(link);
82 }
83 }
84
85
86 Map<?, ?> queries = (Map) root.get("queries");
87 if (queries.containsKey("nextPage")) {
88 ArrayList<?> nextPageArray = (ArrayList) queries.get("nextPage");
89 Map<?, ?> nextPage = (Map) nextPageArray.get(0);
90 int nextStart = (Integer) nextPage.get("startIndex");
91 sendGet(query, listWithLinks, googleKey, googleCx, agent, nextStart);
92 }
93 }
94
95 public static void main(String[] args) {
96 try {
97 final CommandLine cmd = CommandLine
98 .parser()
99 .withName("query-solr")
100 .withHeader("Send WES queries to a Solr server")
101 .withOption("q", "queries", "CSV file with queries", "FILE", CommandLine.Type.FILE_EXISTING, true,
102 false, true)
103 .withOption("o", "output", "Output file", "FILE", CommandLine.Type.FILE, true, false, true)
104 .withOption("k", "google-key", "Google key", "STRING", CommandLine.Type.STRING, true, false, true)
105 .withOption("c", "google-cx", "Google CX", "STRING", CommandLine.Type.STRING, true, false, true)
106 .withOption("a", "agent", String.format("User agent, default %s", DEFAULT_USER_AGENT), "STRING",
107 CommandLine.Type.STRING, true, false, false)
108 .withLogger(LoggerFactory.getLogger("eu.fbk"))
109 .parse(args);
110
111 File outputFile = cmd.getOptionValue("output", File.class);
112 File nafQueriesFile = cmd.getOptionValue("queries", File.class);
113 String userAgent = cmd.getOptionValue("agent", String.class);
114
115 String googleKey = cmd.getOptionValue("google-key", String.class);
116 String googleCx = cmd.getOptionValue("google-cx", String.class);
117
118 BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
119
120 List<String> lines = Files.readLines(nafQueriesFile, Charsets.UTF_8);
121 for (String line : lines) {
122 line = line.trim();
123 if (line.length() == 0) {
124 continue;
125 }
126
127 if (line.startsWith("#")) {
128 continue;
129 }
130
131 String[] parts = line.split("\\t");
132 String id = parts[0];
133 String query = parts[1];
134
135 ArrayList<String> links = new ArrayList<>();
136 sendGet(query, links, googleKey, googleCx, userAgent);
137
138 writer.append(id);
139 for (String link : links) {
140 Matcher matcher = wesPattern.matcher(link);
141 if (matcher.find()) {
142 writer.append("\t").append(matcher.group(1));
143 }
144 }
145
146 writer.append("\n");
147 }
148
149 writer.close();
150
151
152
153 } catch (Exception e) {
154 CommandLine.fail(e);
155 }
156 }
157 }