1   package eu.fbk.dkm.pikes.resources.wes;
2   
3   import com.fasterxml.jackson.databind.ObjectMapper;
4   import com.google.common.base.Charsets;
5   import com.google.common.io.Files;
6   import eu.fbk.utils.core.CommandLine;
7   import org.slf4j.Logger;
8   import org.slf4j.LoggerFactory;
9   
10  import java.io.*;
11  import java.net.HttpURLConnection;
12  import java.net.URL;
13  import java.net.URLEncoder;
14  import java.util.ArrayList;
15  import java.util.List;
16  import java.util.Map;
17  import java.util.regex.Matcher;
18  import java.util.regex.Pattern;
19  
20  /**
21   * Created by alessio on 11/12/15.
22   */
23  
24  public class QueryGoogle {
25  
26      private static final Logger LOGGER = LoggerFactory.getLogger(QueryGoogle.class);
27      //    private static String GoogleKey = "";
28  //    private static String GoogleCx = "";
29      private static String DEFAULT_USER_AGENT = "FBK evaluation";
30  
31  //    private static String nafQueriesFileName = "/Users/alessio/Documents/Resources/wes/wes2015.queries.or.txt";
32  //    private static String outputFileName = "/Users/alessio/Documents/Resources/wes/google-or.txt";
33  
34      private static Pattern wesPattern = Pattern.compile("wes2015\\.(d[0-9]+)\\.naf\\.html");
35  
36      private static void sendGet(String query, ArrayList<String> listWithLinks, String googleKey, String googleCx,
37              String agent) throws Exception {
38          sendGet(query, listWithLinks, googleKey, googleCx, agent, 0);
39      }
40  
41      private static void sendGet(String query, ArrayList<String> listWithLinks, String googleKey, String googleCx,
42              String agent, int start) throws Exception {
43  
44          StringBuffer url = new StringBuffer();
45          url.append("https://www.googleapis.com/customsearch/v1?key=");
46          url.append(googleKey);
47          url.append("&cx=").append(googleCx);
48          url.append("&q=").append(URLEncoder.encode(query, "UTF-8"));
49          if (start > 0) {
50              url.append("&start=").append(start);
51          }
52  
53          URL obj = new URL(url.toString());
54          HttpURLConnection con = (HttpURLConnection) obj.openConnection();
55  
56          // optional default is GET
57          con.setRequestMethod("GET");
58  
59          //add request header
60          con.setRequestProperty("User-Agent", agent);
61  
62          int responseCode = con.getResponseCode();
63          LOGGER.info("Queried Google [{}], response code {}", url, responseCode);
64  
65          BufferedReader in = new BufferedReader(
66                  new InputStreamReader(con.getInputStream()));
67          String inputLine;
68          StringBuffer response = new StringBuffer();
69  
70          while ((inputLine = in.readLine()) != null) {
71              response.append(inputLine);
72          }
73          in.close();
74  
75          ObjectMapper mapper = new ObjectMapper();
76          Map<?, ?> root = mapper.readValue(response.toString(), Map.class);
77          ArrayList<?> items = (ArrayList) root.get("items");
78          if (items != null) {
79              for (Object item : items) {
80                  String link = (String) ((Map<?, ?>) item).get("link");
81                  listWithLinks.add(link);
82              }
83          }
84  
85          // Check for next page
86          Map<?, ?> queries = (Map) root.get("queries");
87          if (queries.containsKey("nextPage")) {
88              ArrayList<?> nextPageArray = (ArrayList) queries.get("nextPage");
89              Map<?, ?> nextPage = (Map) nextPageArray.get(0);
90              int nextStart = (Integer) nextPage.get("startIndex");
91              sendGet(query, listWithLinks, googleKey, googleCx, agent, nextStart);
92          }
93      }
94  
95      public static void main(String[] args) {
96          try {
97              final CommandLine cmd = CommandLine
98                      .parser()
99                      .withName("query-solr")
100                     .withHeader("Send WES queries to a Solr server")
101                     .withOption("q", "queries", "CSV file with queries", "FILE", CommandLine.Type.FILE_EXISTING, true,
102                             false, true)
103                     .withOption("o", "output", "Output file", "FILE", CommandLine.Type.FILE, true, false, true)
104                     .withOption("k", "google-key", "Google key", "STRING", CommandLine.Type.STRING, true, false, true)
105                     .withOption("c", "google-cx", "Google CX", "STRING", CommandLine.Type.STRING, true, false, true)
106                     .withOption("a", "agent", String.format("User agent, default %s", DEFAULT_USER_AGENT), "STRING",
107                             CommandLine.Type.STRING, true, false, false)
108                     .withLogger(LoggerFactory.getLogger("eu.fbk")) //
109                     .parse(args);
110 
111             File outputFile = cmd.getOptionValue("output", File.class);
112             File nafQueriesFile = cmd.getOptionValue("queries", File.class);
113             String userAgent = cmd.getOptionValue("agent", String.class);
114 
115             String googleKey = cmd.getOptionValue("google-key", String.class);
116             String googleCx = cmd.getOptionValue("google-cx", String.class);
117 
118             BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile));
119 
120             List<String> lines = Files.readLines(nafQueriesFile, Charsets.UTF_8);
121             for (String line : lines) {
122                 line = line.trim();
123                 if (line.length() == 0) {
124                     continue;
125                 }
126 
127                 if (line.startsWith("#")) {
128                     continue;
129                 }
130 
131                 String[] parts = line.split("\\t");
132                 String id = parts[0];
133                 String query = parts[1];
134 
135                 ArrayList<String> links = new ArrayList<>();
136                 sendGet(query, links, googleKey, googleCx, userAgent);
137 
138                 writer.append(id);
139                 for (String link : links) {
140                     Matcher matcher = wesPattern.matcher(link);
141                     if (matcher.find()) {
142                         writer.append("\t").append(matcher.group(1));
143                     }
144                 }
145 
146                 writer.append("\n");
147             }
148 
149             writer.close();
150 
151 //            sendGet("fame", links);
152 
153         } catch (Exception e) {
154             CommandLine.fail(e);
155         }
156     }
157 }