1 package eu.fbk.dkm.pikes.resources.trec;
2
3 import com.google.common.base.Charsets;
4 import com.google.common.io.Files;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.KAFDocument;
7 import org.joox.JOOX;
8 import org.slf4j.Logger;
9 import org.slf4j.LoggerFactory;
10 import org.w3c.dom.Document;
11 import org.w3c.dom.Element;
12 import org.xml.sax.SAXException;
13
14 import javax.xml.parsers.DocumentBuilder;
15 import javax.xml.parsers.DocumentBuilderFactory;
16 import javax.xml.parsers.ParserConfigurationException;
17 import java.io.ByteArrayInputStream;
18 import java.io.File;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.text.SimpleDateFormat;
22 import java.util.Calendar;
23 import java.util.regex.Matcher;
24 import java.util.regex.Pattern;
25
26
27
28
29
30
31
32
33
34
35 public class FR94 {
36
37 private static final Logger LOGGER = LoggerFactory.getLogger(FR94.class);
38 private static String DEFAULT_URL = "http://document/%s";
39
40 public static void main(String[] args) {
41
42 try {
43
44 final CommandLine cmd = CommandLine
45 .parser()
46 .withName("fr94-extractor")
47 .withHeader("Extract FR94 documents from TREC dataset and save them in NAF format")
48 .withOption("i", "input", "Input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
49 false, true)
50 .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
51 .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
52 CommandLine.Type.STRING, true, false, false)
53 .withLogger(LoggerFactory.getLogger("eu.fbk"))
54 .parse(args);
55
56 File inputDir = cmd.getOptionValue("input", File.class);
57
58 String urlTemplate = DEFAULT_URL;
59 if (cmd.hasOption("url-template")) {
60 urlTemplate = cmd.getOptionValue("url-template", String.class);
61 }
62
63 File outputDir = cmd.getOptionValue("output", File.class);
64 if (!outputDir.exists()) {
65 outputDir.mkdirs();
66 }
67
68 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputDir)) {
69 if (!file.isFile()) {
70 continue;
71 }
72 if (file.getName().startsWith(".")) {
73 continue;
74 }
75
76 String outputTemplate = outputDir.getAbsolutePath() + File.separator + file.getName();
77 File newFolder = new File(outputTemplate);
78 newFolder.mkdirs();
79
80 outputTemplate += File.separator + "NAF";
81 saveFile(file, outputTemplate, urlTemplate);
82 }
83 } catch (Exception e) {
84 CommandLine.fail(e);
85 }
86 }
87
88 private static void saveFile(File inputFile, String outputFilePattern, String urlTemplate)
89 throws IOException, SAXException, ParserConfigurationException {
90
91 LOGGER.info("Input file: {}", inputFile);
92
93 StringBuffer stringBuffer = new StringBuffer();
94 stringBuffer.append("<?xml version=\"1.0\"?>\n"
95 + "<!DOCTYPE tutorials [\n");
96 stringBuffer.append("<!ENTITY hyph \"-\">\n");
97 stringBuffer.append("<!ENTITY blank \" \">\n");
98 stringBuffer.append("<!ENTITY sect \" \">\n");
99 stringBuffer.append("<!ENTITY para \" \">\n");
100 stringBuffer.append("<!ENTITY cir \" \">\n");
101 stringBuffer.append("<!ENTITY rsquo \" \">\n");
102 stringBuffer.append("<!ENTITY mu \" \">\n");
103 stringBuffer.append("<!ENTITY times \" \">\n");
104 stringBuffer.append("<!ENTITY bull \" \">\n");
105 stringBuffer.append("<!ENTITY ge \">=\">\n");
106 stringBuffer.append("<!ENTITY reg \" \">\n");
107 stringBuffer.append("<!ENTITY cent \" \">\n");
108 stringBuffer.append("<!ENTITY amp \" \">\n");
109 stringBuffer.append("<!ENTITY gt \">\">\n");
110 stringBuffer.append("<!ENTITY lt \"<\">\n");
111 stringBuffer.append("<!ENTITY acirc \"a\">\n");
112 stringBuffer.append("<!ENTITY ncirc \"n\">\n");
113 stringBuffer.append("<!ENTITY atilde \"a\">\n");
114 stringBuffer.append("<!ENTITY ntilde \"n\">\n");
115 stringBuffer.append("<!ENTITY otilde \"o\">\n");
116 stringBuffer.append("<!ENTITY utilde \"u\">\n");
117 stringBuffer.append("<!ENTITY aacute \"a\">\n");
118 stringBuffer.append("<!ENTITY cacute \"c\">\n");
119 stringBuffer.append("<!ENTITY eacute \"e\">\n");
120 stringBuffer.append("<!ENTITY Eacute \"E\">\n");
121 stringBuffer.append("<!ENTITY Gacute \"G\">\n");
122 stringBuffer.append("<!ENTITY iacute \"i\">\n");
123 stringBuffer.append("<!ENTITY lacute \"l\">\n");
124 stringBuffer.append("<!ENTITY nacute \"n\">\n");
125 stringBuffer.append("<!ENTITY oacute \"o\">\n");
126 stringBuffer.append("<!ENTITY pacute \"p\">\n");
127 stringBuffer.append("<!ENTITY racute \"r\">\n");
128 stringBuffer.append("<!ENTITY sacute \"s\">\n");
129 stringBuffer.append("<!ENTITY uacute \"u\">\n");
130 stringBuffer.append("<!ENTITY ocirc \"o\">\n");
131 stringBuffer.append("<!ENTITY auml \"a\">\n");
132 stringBuffer.append("<!ENTITY euml \"e\">\n");
133 stringBuffer.append("<!ENTITY Euml \"E\">\n");
134 stringBuffer.append("<!ENTITY iuml \"i\">\n");
135 stringBuffer.append("<!ENTITY Iuml \"I\">\n");
136 stringBuffer.append("<!ENTITY Kuml \"K\">\n");
137 stringBuffer.append("<!ENTITY Ouml \"O\">\n");
138 stringBuffer.append("<!ENTITY ouml \"o\">\n");
139 stringBuffer.append("<!ENTITY uuml \"u\">\n");
140 stringBuffer.append("<!ENTITY Ccedil \"C\">\n");
141 stringBuffer.append("<!ENTITY ccedil \"c\">\n");
142 stringBuffer.append("<!ENTITY agrave \"a\">\n");
143 stringBuffer.append("<!ENTITY Agrave \"A\">\n");
144 stringBuffer.append("<!ENTITY egrave \"e\">\n");
145 stringBuffer.append("<!ENTITY Egrave \"E\">\n");
146 stringBuffer.append("<!ENTITY igrave \"i\">\n");
147 stringBuffer.append("<!ENTITY Ograve \"O\">\n");
148 stringBuffer.append("<!ENTITY ograve \"o\">\n");
149 stringBuffer.append("<!ENTITY ugrave \"u\">\n");
150 stringBuffer.append("]>\n");
151 stringBuffer.append("<ROOT>\n");
152 stringBuffer.append(Files.toString(inputFile, Charsets.UTF_8));
153 stringBuffer.append("\n</ROOT>\n");
154
155 InputStream is = new ByteArrayInputStream(stringBuffer.toString().getBytes());
156 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
157 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
158 Document doc = dBuilder.parse(is);
159
160 doc.getDocumentElement().normalize();
161
162 int i = 0;
163 for (Element element : JOOX.$(doc).find("DOC")) {
164 Element docnoElement = JOOX.$(element).find("DOCNO").get(0);
165 Element textElement = JOOX.$(element).find("TEXT").get(0);
166
167
168 i++;
169 File outputFile = new File(outputFilePattern + "-" + i + ".naf");
170
171 if (textElement == null) {
172 LOGGER.error("TEXT is null");
173 continue;
174 }
175
176 String text = textElement.getTextContent().trim();
177
178 String docno = "";
179 if (docnoElement != null) {
180 docno = docnoElement.getTextContent().trim();
181 }
182
183 if (docno.equals("")) {
184 LOGGER.error("DOCNO is empty");
185 }
186
187 String url = String.format(urlTemplate, docno);
188
189 text = text.replaceAll("([^\\n])\\n([^\\n])", "$1 $2");
190 text = text.replaceAll("\\n+([a-z])", " $1");
191
192 KAFDocument document = new KAFDocument("en", "v3");
193 document.setRawText(text);
194
195 KAFDocument.FileDesc fileDesc = document.createFileDesc();
196 fileDesc.title = docno;
197 KAFDocument.Public aPublic = document.createPublic();
198 aPublic.uri = url;
199 aPublic.publicId = docno;
200
201 document.save(outputFile.getAbsolutePath());
202 }
203 }
204 }