1 package eu.fbk.dkm.pikes.resources.trec;
2
3 import com.google.common.base.Charsets;
4 import com.google.common.io.Files;
5 import eu.fbk.utils.core.CommandLine;
6 import ixa.kaflib.KAFDocument;
7 import org.joox.JOOX;
8 import org.slf4j.Logger;
9 import org.slf4j.LoggerFactory;
10 import org.w3c.dom.Document;
11 import org.w3c.dom.Element;
12 import org.xml.sax.SAXException;
13
14 import javax.xml.parsers.DocumentBuilder;
15 import javax.xml.parsers.DocumentBuilderFactory;
16 import javax.xml.parsers.ParserConfigurationException;
17 import java.io.ByteArrayInputStream;
18 import java.io.File;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.text.DateFormat;
22 import java.text.SimpleDateFormat;
23 import java.util.Calendar;
24 import java.util.Date;
25 import java.util.Locale;
26
27
28
29
30
31
32
33
34
35
36 public class FBIS {
37
38 private static final Logger LOGGER = LoggerFactory.getLogger(FBIS.class);
39 private static String DEFAULT_URL = "http://document/%s";
40 private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
41 private static DateFormat format = new SimpleDateFormat("d MMMM yyyy", Locale.ENGLISH);
42
43 public static void main(String[] args) {
44
45 try {
46
47 final CommandLine cmd = CommandLine
48 .parser()
49 .withName("fbis-extractor")
50 .withHeader("Extract FBIS documents from TREC dataset and save them in NAF format")
51 .withOption("i", "input", "Input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true,
52 false, true)
53 .withOption("o", "output", "Output folder", "FOLDER", CommandLine.Type.DIRECTORY, true, false, true)
54 .withOption("u", "url-template", "URL template (with %d for the ID)", "URL",
55 CommandLine.Type.STRING, true, false, false)
56 .withLogger(LoggerFactory.getLogger("eu.fbk"))
57 .parse(args);
58
59 File inputDir = cmd.getOptionValue("input", File.class);
60
61 String urlTemplate = DEFAULT_URL;
62 if (cmd.hasOption("url-template")) {
63 urlTemplate = cmd.getOptionValue("url-template", String.class);
64 }
65
66 File outputDir = cmd.getOptionValue("output", File.class);
67 if (!outputDir.exists()) {
68 outputDir.mkdirs();
69 }
70
71 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputDir)) {
72 if (!file.isFile()) {
73 continue;
74 }
75 if (file.getName().startsWith(".")) {
76 continue;
77 }
78
79 String outputTemplate = outputDir.getAbsolutePath() + File.separator + file.getName();
80 File newFolder = new File(outputTemplate);
81 newFolder.mkdirs();
82
83 outputTemplate += File.separator + "NAF";
84 saveFile(file, outputTemplate, urlTemplate);
85 }
86 } catch (Exception e) {
87 CommandLine.fail(e);
88 }
89 }
90
91 private static void saveFile(File inputFile, String outputFilePattern, String urlTemplate)
92 throws IOException, SAXException, ParserConfigurationException {
93
94 LOGGER.info("Input file: {}", inputFile);
95
96 StringBuffer stringBuffer = new StringBuffer();
97 stringBuffer.append("<?xml version=\"1.0\"?>\n"
98 + "<!DOCTYPE tutorials [\n");
99 stringBuffer.append("<!ENTITY amp \" \">\n");
100 stringBuffer.append("<!ENTITY gt \" \">\n");
101 stringBuffer.append("<!ENTITY lt \" \">\n");
102 stringBuffer.append("<!ENTITY AElig \"A\">\n");
103 stringBuffer.append("<!ENTITY ap \" \">\n");
104 stringBuffer.append("<!ENTITY deg \" \">\n");
105 stringBuffer.append("<!ENTITY egrave \"e\">\n");
106 stringBuffer.append("<!ENTITY eacute \"e\">\n");
107 stringBuffer.append("<!ENTITY oacute \"o\">\n");
108 stringBuffer.append("<!ENTITY ubreve \"u\">\n");
109 stringBuffer.append("<!ENTITY Ubreve \"U\">\n");
110 stringBuffer.append("<!ENTITY egs \" \">\n");
111 stringBuffer.append("<!ENTITY els \" \">\n");
112 stringBuffer.append("<!ENTITY percnt \" \">\n");
113 stringBuffer.append("<!ENTITY pound \"£\">\n");
114 stringBuffer.append("<!ENTITY yen \"¥\">\n");
115 stringBuffer.append("<!ENTITY agr \"\">\n");
116 stringBuffer.append("<!ENTITY bgr \"\">\n");
117 stringBuffer.append("<!ENTITY dgr \"\">\n");
118 stringBuffer.append("<!ENTITY egr \"\">\n");
119 stringBuffer.append("<!ENTITY ggr \"\">\n");
120 stringBuffer.append("<!ENTITY Ggr \"\">\n");
121 stringBuffer.append("<!ENTITY kgr \"\">\n");
122 stringBuffer.append("<!ENTITY lgr \"\">\n");
123 stringBuffer.append("<!ENTITY mgr \"\">\n");
124 stringBuffer.append("<!ENTITY pgr \"\">\n");
125 stringBuffer.append("<!ENTITY rgr \"\">\n");
126 stringBuffer.append("<!ENTITY sgr \"\">\n");
127 stringBuffer.append("<!ENTITY tgr \"\">\n");
128 stringBuffer.append("<!ENTITY xgr \"\">\n");
129 stringBuffer.append("<!ENTITY zgr \"\">\n");
130 stringBuffer.append("<!ENTITY eegr \"\">\n");
131 stringBuffer.append("<!ENTITY khgr \"\">\n");
132 stringBuffer.append("<!ENTITY phgr \"\">\n");
133 stringBuffer.append("<!ENTITY thgr \"\">\n");
134 stringBuffer.append("<!ENTITY ohm \"\">\n");
135 stringBuffer.append("<!ENTITY Bgr \"\">\n");
136 stringBuffer.append("<!ENTITY Ngr \"\">\n");
137 stringBuffer.append("<!ENTITY EEgr \"\">\n");
138 stringBuffer.append("<!ENTITY OHgr \"\">\n");
139 stringBuffer.append("<!ENTITY PSgr \"\">\n");
140 stringBuffer.append("<!ENTITY Omacr \"\">\n");
141 stringBuffer.append("]>\n");
142 stringBuffer.append("<ROOT>\n");
143 stringBuffer.append(Files.toString(inputFile, Charsets.UTF_8)
144 .replaceAll("<F P=[0-9]+>", "<F>")
145 .replaceAll("<FIG ID=[^>]+>", "<FIG>")
146 .replaceAll("</?3>", "")
147 );
148 stringBuffer.append("\n</ROOT>\n");
149
150 InputStream is = new ByteArrayInputStream(stringBuffer.toString().getBytes());
151 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
152 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
153 Document doc = dBuilder.parse(is);
154
155 doc.getDocumentElement().normalize();
156
157 int i = 0;
158 for (Element element : JOOX.$(doc).find("DOC")) {
159 Element docnoElement = JOOX.$(element).find("DOCNO").get(0);
160 Element dateElement = JOOX.$(element).find("DATE1").get(0);
161 Element headlineElement = JOOX.$(element).find("TI").get(0);
162 Element textElement = JOOX.$(element).find("TEXT").get(0);
163
164
165 i++;
166 File outputFile = new File(outputFilePattern + "-" + i + ".naf");
167
168 if (textElement == null) {
169 LOGGER.error("TEXT is null");
170 continue;
171 }
172
173 String text = JOOX.$(element).find("TEXT").content();
174 if (text.length() == 0) {
175 LOGGER.error("TEXT is empty");
176 continue;
177 }
178
179 String docno = "";
180 if (docnoElement != null) {
181 docno = docnoElement.getTextContent().trim();
182 }
183
184 String date = "";
185 if (dateElement != null) {
186 date = dateElement.getTextContent().trim();
187 }
188
189 String headline = "";
190 if (headlineElement != null) {
191 headline = headlineElement.getTextContent().trim();
192 }
193
194 if (docno.equals("")) {
195 LOGGER.error("DOCNO is empty");
196 }
197
198 String url = String.format(urlTemplate, docno);
199
200 headline = headline.replace('\n', ' ');
201 headline = headline.replaceAll("\\s+", " ");
202 text = text.replace('\n', ' ');
203 text = text.replaceAll("\\s+", " ");
204
205 Date thisDate = null;
206 try {
207 thisDate = format.parse(date);
208 } catch (Exception e) {
209
210 }
211
212 text = headline + "\n\n" + text;
213
214 KAFDocument document = new KAFDocument("en", "v3");
215 document.setRawText(text);
216
217 KAFDocument.FileDesc fileDesc = document.createFileDesc();
218 fileDesc.title = headline;
219 if (thisDate != null) {
220 fileDesc.creationtime = sdf.format(thisDate);
221 }
222 KAFDocument.Public aPublic = document.createPublic();
223 aPublic.uri = url;
224 aPublic.publicId = docno;
225
226 document.save(outputFile.getAbsolutePath());
227 }
228 }
229 }