1 package eu.fbk.dkm.pikes.resources.wes;
2
3 import ixa.kaflib.KAFDocument;
4 import org.apache.commons.io.FileUtils;
5 import org.slf4j.Logger;
6 import org.slf4j.LoggerFactory;
7 import org.w3c.dom.Document;
8 import org.w3c.dom.Element;
9
10 import javax.xml.parsers.DocumentBuilder;
11 import javax.xml.parsers.DocumentBuilderFactory;
12 import javax.xml.transform.Transformer;
13 import javax.xml.transform.TransformerFactory;
14 import javax.xml.transform.dom.DOMSource;
15 import javax.xml.transform.stream.StreamResult;
16 import java.io.File;
17 import java.util.Iterator;
18 import java.util.regex.Matcher;
19 import java.util.regex.Pattern;
20
21
22
23
24
25 public class ConvertForSolr {
26
27 private static final Logger LOGGER = LoggerFactory.getLogger(ConvertForSolr.class);
28 private static Pattern wesFilePattern = Pattern.compile("wes2015\\.d[0-9]+\\.naf");
29
30 public static void main(String[] args) {
31
32 String nafFolder = "/Users/alessio/Documents/Resources/wes/new";
33 String xmlFolder = "/Users/alessio/Documents/Resources/wes/xml-no-title";
34 String[] extensions = new String[] { "naf" };
35
36 File nafFolderFile = new File(nafFolder);
37 File htmlFolderFile = new File(xmlFolder);
38
39 try {
40 Iterator<File> fileIterator = FileUtils.iterateFiles(nafFolderFile, extensions, true);
41
42 if (!htmlFolderFile.exists()) {
43 htmlFolderFile.mkdirs();
44 }
45
46 DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
47 DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
48 TransformerFactory transformerFactory = TransformerFactory.newInstance();
49 Transformer transformer = transformerFactory.newTransformer();
50
51 fileIterator.forEachRemaining((File f) -> {
52 File outputXml = new File(xmlFolder + File.separator + f.getName() + ".xml");
53
54 try {
55
56 Matcher m = wesFilePattern.matcher(f.getName());
57 if (m.matches()) {
58 KAFDocument document = KAFDocument.createFromFile(f);
59 String title = document.getFileDesc().title;
60 String text = document.getRawText().substring(title.length() + 1).trim();
61 String id = document.getPublic().publicId;
62
63 Document doc = docBuilder.newDocument();
64 Element moreRootElement = doc.createElement("add");
65 Element rootElement = doc.createElement("doc");
66 doc.appendChild(moreRootElement);
67 moreRootElement.appendChild(rootElement);
68
69 Element idEl = doc.createElement("field");
70 idEl.setAttribute("name", "id");
71 idEl.setTextContent(id);
72 rootElement.appendChild(idEl);
73
74 Element titleEl = doc.createElement("field");
75 titleEl.setAttribute("name", "title");
76 titleEl.setTextContent(title);
77 rootElement.appendChild(titleEl);
78
79 Element textEl = doc.createElement("field");
80 textEl.setAttribute("name", "text");
81 textEl.setTextContent(text);
82 rootElement.appendChild(textEl);
83
84 DOMSource source = new DOMSource(doc);
85 StreamResult result = new StreamResult(outputXml);
86 transformer.transform(source, result);
87 }
88
89 } catch (Exception e) {
90 e.printStackTrace();
91 }
92 });
93
94 } catch (Exception e) {
95 e.printStackTrace();
96 }
97 }
98 }