1   package eu.fbk.dkm.pikes.resources.wes;
2   
3   import ixa.kaflib.KAFDocument;
4   import org.apache.commons.io.FileUtils;
5   import org.slf4j.Logger;
6   import org.slf4j.LoggerFactory;
7   import org.w3c.dom.Document;
8   import org.w3c.dom.Element;
9   
10  import javax.xml.parsers.DocumentBuilder;
11  import javax.xml.parsers.DocumentBuilderFactory;
12  import javax.xml.transform.Transformer;
13  import javax.xml.transform.TransformerFactory;
14  import javax.xml.transform.dom.DOMSource;
15  import javax.xml.transform.stream.StreamResult;
16  import java.io.File;
17  import java.util.Iterator;
18  import java.util.regex.Matcher;
19  import java.util.regex.Pattern;
20  
21  /**
22   * Created by alessio on 06/12/15.
23   */
24  
25  public class ConvertForSolr {
26  
27      private static final Logger LOGGER = LoggerFactory.getLogger(ConvertForSolr.class);
28      private static Pattern wesFilePattern = Pattern.compile("wes2015\\.d[0-9]+\\.naf");
29  
30      public static void main(String[] args) {
31  
32          String nafFolder = "/Users/alessio/Documents/Resources/wes/new";
33          String xmlFolder = "/Users/alessio/Documents/Resources/wes/xml-no-title";
34          String[] extensions = new String[] { "naf" };
35  
36          File nafFolderFile = new File(nafFolder);
37          File htmlFolderFile = new File(xmlFolder);
38  
39          try {
40              Iterator<File> fileIterator = FileUtils.iterateFiles(nafFolderFile, extensions, true);
41  
42              if (!htmlFolderFile.exists()) {
43                  htmlFolderFile.mkdirs();
44              }
45  
46              DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
47              DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
48              TransformerFactory transformerFactory = TransformerFactory.newInstance();
49              Transformer transformer = transformerFactory.newTransformer();
50  
51              fileIterator.forEachRemaining((File f) -> {
52                  File outputXml = new File(xmlFolder + File.separator + f.getName() + ".xml");
53  
54                  try {
55  
56                      Matcher m = wesFilePattern.matcher(f.getName());
57                      if (m.matches()) {
58                          KAFDocument document = KAFDocument.createFromFile(f);
59                          String title = document.getFileDesc().title;
60                          String text = document.getRawText().substring(title.length() + 1).trim();
61                          String id = document.getPublic().publicId;
62  
63                          Document doc = docBuilder.newDocument();
64                          Element moreRootElement = doc.createElement("add");
65                          Element rootElement = doc.createElement("doc");
66                          doc.appendChild(moreRootElement);
67                          moreRootElement.appendChild(rootElement);
68  
69                          Element idEl = doc.createElement("field");
70                          idEl.setAttribute("name", "id");
71                          idEl.setTextContent(id);
72                          rootElement.appendChild(idEl);
73  
74                          Element titleEl = doc.createElement("field");
75                          titleEl.setAttribute("name", "title");
76                          titleEl.setTextContent(title);
77                          rootElement.appendChild(titleEl);
78  
79                          Element textEl = doc.createElement("field");
80                          textEl.setAttribute("name", "text");
81                          textEl.setTextContent(text);
82                          rootElement.appendChild(textEl);
83  
84                          DOMSource source = new DOMSource(doc);
85                          StreamResult result = new StreamResult(outputXml);
86                          transformer.transform(source, result);
87                      }
88  
89                  } catch (Exception e) {
90                      e.printStackTrace();
91                  }
92              });
93  
94          } catch (Exception e) {
95              e.printStackTrace();
96          }
97      }
98  }