1 package eu.fbk.dkm.pikes.resources.darmstadt;
2
3 import com.google.common.base.Charsets;
4 import com.google.common.io.Files;
5 import eu.fbk.utils.core.CommandLine;
6 import eu.fbk.dkm.pikes.naflib.Corpus;
7 import ixa.kaflib.KAFDocument;
8 import org.slf4j.LoggerFactory;
9 import org.w3c.dom.Document;
10 import org.w3c.dom.Node;
11 import org.w3c.dom.NodeList;
12
13 import javax.xml.parsers.DocumentBuilder;
14 import javax.xml.parsers.DocumentBuilderFactory;
15 import java.io.File;
16 import java.nio.file.Path;
17 import java.util.HashMap;
18
19
20
21
22
23 public class IdentifyDocuments {
24
25 private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(IdentifyDocuments.class);
26
27 private static int minimum(int a, int b, int c) {
28 return Math.min(Math.min(a, b), c);
29 }
30
31 public static int computeLevenshteinDistance(String str1, String str2) {
32 int[][] distance = new int[str1.length() + 1][str2.length() + 1];
33
34 for (int i = 0; i <= str1.length(); i++) {
35 distance[i][0] = i;
36 }
37 for (int j = 1; j <= str2.length(); j++) {
38 distance[0][j] = j;
39 }
40
41 for (int i = 1; i <= str1.length(); i++) {
42 for (int j = 1; j <= str2.length(); j++) {
43 distance[i][j] = minimum(
44 distance[i - 1][j] + 1,
45 distance[i][j - 1] + 1,
46 distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
47 }
48 }
49
50 return distance[str1.length()][str2.length()];
51 }
52
53 public static void main(String[] args) {
54 try {
55 final CommandLine cmd = CommandLine
56 .parser()
57 .withName("yamcha-extractor")
58 .withHeader("Check ESWC dataset with Darmstadt")
59 .withOption("i", "input-folder", "the folder of the NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
60 .withOption("d", "dataset-file", "the XML file provided from the task organizers", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
61 .withOption("o", "output-file", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
62 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
63
64 File inputFolder = cmd.getOptionValue("input-folder", File.class);
65 File datasetFile = cmd.getOptionValue("dataset-file", File.class);
66 File outputFile = cmd.getOptionValue("output-file", File.class);
67
68 HashMap<String, String> textToFile = new HashMap<>();
69
70 Corpus corpus = Corpus.create(false, inputFolder);
71 for (Path file : corpus.files()) {
72
73
74
75
76
77 KAFDocument document = KAFDocument.createFromFile(file.toFile());
78 String text = document.getRawText();
79 text = text.replaceAll("[^a-zA-Z]", "");
80 textToFile.put(text, file.toFile().getName());
81 }
82
83 StringBuffer buffer = new StringBuffer();
84
85 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
86 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
87 Document doc = dBuilder.parse(datasetFile);
88 NodeList nList = doc.getElementsByTagName("text");
89 for (int temp = 0; temp < nList.getLength(); temp++) {
90 Node nNode = nList.item(temp);
91 if (nNode.getNodeType() != Node.ELEMENT_NODE) {
92 continue;
93 }
94 String text = nNode.getTextContent();
95
96
97
98 text = text.replaceAll("[^a-zA-Z]", "");
99 if (textToFile.keySet().contains(text)) {
100 buffer.append(textToFile.get(text)).append("\n");
101 }
102 else {
103
104 int found = 0;
105 String fileFound = null;
106
107 for (String key : textToFile.keySet()) {
108 int distance = computeLevenshteinDistance(key, text);
109 double ratio = (distance * 1.0) / (key.length() * 1.0);
110 if (ratio < 0.02) {
111 found++;
112 fileFound = key;
113 }
114 }
115
116 if (found == 1) {
117 buffer.append(textToFile.get(fileFound)).append("\n");
118 }
119 else {
120 System.out.println("---");
121 System.out.println(nNode.getTextContent());
122 System.out.println("NOT FOUND!");
123 }
124 }
125 }
126
127 Files.write(buffer.toString(), outputFile, Charsets.UTF_8);
128
129 } catch (final Throwable ex) {
130 CommandLine.fail(ex);
131 }
132 }
133 }