1   package eu.fbk.dkm.pikes.resources.darmstadt;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import eu.fbk.dkm.pikes.naflib.Corpus;
7   import ixa.kaflib.KAFDocument;
8   import org.slf4j.LoggerFactory;
9   import org.w3c.dom.Document;
10  import org.w3c.dom.Node;
11  import org.w3c.dom.NodeList;
12  
13  import javax.xml.parsers.DocumentBuilder;
14  import javax.xml.parsers.DocumentBuilderFactory;
15  import java.io.File;
16  import java.nio.file.Path;
17  import java.util.HashMap;
18  
19  /**
20   * Created by alessio on 25/05/15.
21   */
22  
23  public class IdentifyDocuments {
24  
25  	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(IdentifyDocuments.class);
26  
27  	private static int minimum(int a, int b, int c) {
28  		return Math.min(Math.min(a, b), c);
29  	}
30  
31  	public static int computeLevenshteinDistance(String str1, String str2) {
32  		int[][] distance = new int[str1.length() + 1][str2.length() + 1];
33  
34  		for (int i = 0; i <= str1.length(); i++) {
35  			distance[i][0] = i;
36  		}
37  		for (int j = 1; j <= str2.length(); j++) {
38  			distance[0][j] = j;
39  		}
40  
41  		for (int i = 1; i <= str1.length(); i++) {
42  			for (int j = 1; j <= str2.length(); j++) {
43  				distance[i][j] = minimum(
44  						distance[i - 1][j] + 1,
45  						distance[i][j - 1] + 1,
46  						distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
47  			}
48  		}
49  
50  		return distance[str1.length()][str2.length()];
51  	}
52  
53  	public static void main(String[] args) {
54  		try {
55  			final CommandLine cmd = CommandLine
56  					.parser()
57  					.withName("yamcha-extractor")
58  					.withHeader("Check ESWC dataset with Darmstadt")
59  					.withOption("i", "input-folder", "the folder of the NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
60  					.withOption("d", "dataset-file", "the XML file provided from the task organizers", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
61  					.withOption("o", "output-file", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
62  					.withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
63  
64  			File inputFolder = cmd.getOptionValue("input-folder", File.class);
65  			File datasetFile = cmd.getOptionValue("dataset-file", File.class);
66  			File outputFile = cmd.getOptionValue("output-file", File.class);
67  
68  			HashMap<String, String> textToFile = new HashMap<>();
69  
70  			Corpus corpus = Corpus.create(false, inputFolder);
71  			for (Path file : corpus.files()) {
72  
73  //				if (!file.toFile().getAbsolutePath().contains("webs-review-66EE-776CCC4-39995BC2-prod6")) {
74  //					continue;
75  //				}
76  
77  				KAFDocument document = KAFDocument.createFromFile(file.toFile());
78  				String text = document.getRawText();
79  				text = text.replaceAll("[^a-zA-Z]", "");
80  				textToFile.put(text, file.toFile().getName());
81  			}
82  
83  			StringBuffer buffer = new StringBuffer();
84  
85  			DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
86  			DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
87  			Document doc = dBuilder.parse(datasetFile);
88  			NodeList nList = doc.getElementsByTagName("text");
89  			for (int temp = 0; temp < nList.getLength(); temp++) {
90  				Node nNode = nList.item(temp);
91  				if (nNode.getNodeType() != Node.ELEMENT_NODE) {
92  					continue;
93  				}
94  				String text = nNode.getTextContent();
95  //				if (!text.contains("http://www.epinions.com/webs-review-66EE-776CCC4-39995BC2-prod6")) {
96  //					continue;
97  //				}
98  				text = text.replaceAll("[^a-zA-Z]", "");
99  				if (textToFile.keySet().contains(text)) {
100 					buffer.append(textToFile.get(text)).append("\n");
101 				}
102 				else {
103 
104 					int found = 0;
105 					String fileFound = null;
106 
107 					for (String key : textToFile.keySet()) {
108 						int distance = computeLevenshteinDistance(key, text);
109 						double ratio = (distance * 1.0) / (key.length() * 1.0);
110 						if (ratio < 0.02) {
111 							found++;
112 							fileFound = key;
113 						}
114 					}
115 
116 					if (found == 1) {
117 						buffer.append(textToFile.get(fileFound)).append("\n");
118 					}
119 					else {
120 						System.out.println("---");
121 						System.out.println(nNode.getTextContent());
122 						System.out.println("NOT FOUND!");
123 					}
124 				}
125 			}
126 
127 			Files.write(buffer.toString(), outputFile, Charsets.UTF_8);
128 
129 		} catch (final Throwable ex) {
130 			CommandLine.fail(ex);
131 		}
132 	}
133 }