1   package eu.fbk.dkm.pikes.resources.darmstadt;
2   
import com.google.common.base.Charsets;
import com.google.common.io.Files;
import eu.fbk.rdfpro.util.Statements;
import eu.fbk.utils.core.CommandLine;
import ixa.kaflib.KAFDocument;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Pattern;
21  
22  /**
23   * Created by alessio on 10/04/15.
24   */
25  
26  public class CorpusLoader {
27  
28  	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CorpusLoader.class);
29  	public static final String DEFAULT_NAMESPACE = "https://www.ukp.tu-eu.fbk.dkm.pikes.resources.darmstadt.de/eu.fbk.dkm.pikes.resources.darmstadt-service-review-corpus/";
30  	public static final String[] MMAX_PATTERN = new String[]{"basedata", "markables"};
31  	public static final String[] MMAX_SUFFIXES = new String[]{"_words", "_OpinionExpression_level"};
32  
33  	private static void getFilesRecursive(File pFile, HashSet<String> folders) {
34  		for (File file : pFile.listFiles()) {
35  			if (file.isDirectory()) {
36  				folders.add(file.getAbsolutePath());
37  				getFilesRecursive(file, folders);
38  			}
39  		}
40  	}
41  
42  	public static void main(String[] args) {
43  		try {
44  			final CommandLine cmd = CommandLine
45  					.parser()
46  					.withName("eu.fbk.dkm.pikes.resources.darmstadt-loader")
47  					.withHeader("Load eu.fbk.dkm.pikes.resources.darmstadt-service-review-corpus")
48  					.withOption("i", "input-folder", "the folder of the corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
49  					.withOption("n", "namespace", String.format("the namespace for generating document URIs, default %s", DEFAULT_NAMESPACE), "NS", CommandLine.Type.STRING, true, false, false)
50  					.withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
51  
52  			final File inputFile = cmd.getOptionValue("i", File.class);
53  
54  			String namespace = DEFAULT_NAMESPACE;
55  			if (cmd.hasOption("n")) {
56  				namespace = cmd.getOptionValue("n", String.class);
57  			}
58  
59  			DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
60  			dbFactory.setValidating(false);
61  			dbFactory.setNamespaceAware(true);
62  			dbFactory.setFeature("http://xml.org/sax/features/namespaces", false);
63  			dbFactory.setFeature("http://xml.org/sax/features/validation", false);
64  			dbFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
65  			dbFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
66  
67  			HashSet<String> folders = new HashSet<>();
68  			getFilesRecursive(inputFile, folders);
69  
70  			HashSet<String> okFolders = new HashSet<>();
71  			okLoop:
72  			for (String folder : folders) {
73  				for (String pattern : MMAX_PATTERN) {
74  					StringBuffer newFolder = new StringBuffer();
75  					newFolder.append(folder);
76  					newFolder.append(File.separator);
77  					newFolder.append(pattern);
78  
79  					if (!folders.contains(newFolder.toString())) {
80  						continue okLoop;
81  					}
82  				}
83  
84  				okFolders.add(folder);
85  			}
86  
87  			for (String folder : okFolders) {
88  				LOGGER.info("Entering folder {}", folder);
89  
90  				String baseDataDir = folder + File.separator + MMAX_PATTERN[0];
91  				File nafDir = new File(folder + File.separator + "naf");
92  
93  				if (nafDir.exists()) {
94  					LOGGER.warn("{} dir exists", nafDir.getAbsolutePath());
95  				}
96  				else {
97  					nafDir.mkdir();
98  				}
99  
100 				Iterator<File> fileIterator;
101 				fileIterator = FileUtils.iterateFiles(new File(baseDataDir), new String[]{"xml"}, false);
102 				while (fileIterator.hasNext()) {
103 					File file = fileIterator.next();
104 					StringBuffer stringBuffer = new StringBuffer();
105 					String fileBaseName = FilenameUtils.removeExtension(file.getName());
106 					fileBaseName = fileBaseName.replaceAll(MMAX_SUFFIXES[0], "");
107 
108 					String fileContent = Files.toString(file, Charsets.UTF_8);
109 
110 					// Fix
111 					fileContent = fileContent.replaceAll("&", "&amp;");
112 
113 					DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
114 					Document doc = dBuilder.parse(new ByteArrayInputStream(fileContent.getBytes(Charsets.UTF_8)));
115 					NodeList nList = doc.getElementsByTagName("word");
116 					for (int temp = 0; temp < nList.getLength(); temp++) {
117 						Element nNode = (Element) nList.item(temp);
118 						stringBuffer.append(nNode.getTextContent().replaceAll("\\s+", ""));
119 						stringBuffer.append(" ");
120 					}
121 
122 					String nafFileName = fileBaseName + ".naf";
123 					File nafFile = new File(nafDir.getAbsolutePath() + File.separator + nafFileName);
124 					String text = stringBuffer.toString().trim();
125 					String documentURI = namespace + nafFileName;
126 
127 					final KAFDocument document = new KAFDocument("en", "v3");
128 					document.setRawText(text);
129 					document.createPublic();
130 					document.getPublic().publicId = Statements.VALUE_FACTORY.createIRI(documentURI).getLocalName();
131 					document.getPublic().uri = documentURI;
132 					document.createFileDesc();
133 					document.getFileDesc().filename = nafFileName;
134 					document.getFileDesc().title = "-";
135 					document.save(nafFile.getAbsolutePath());
136 				}
137 			}
138 
139 		} catch (final Throwable ex) {
140 			CommandLine.fail(ex);
141 		}
142 	}
143 }