1   package eu.fbk.dkm.pikes.resources.mpqa;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.rdfpro.util.Statements;
5   import eu.fbk.utils.core.CommandLine;
6   import ixa.kaflib.KAFDocument;
7   import org.slf4j.LoggerFactory;
8   import org.w3c.dom.Document;
9   import org.w3c.dom.Element;
10  import org.w3c.dom.Node;
11  import org.w3c.dom.NodeList;
12  
13  import javax.xml.parsers.DocumentBuilder;
14  import javax.xml.parsers.DocumentBuilderFactory;
15  import javax.xml.stream.XMLStreamException;
16  import java.io.File;
17  import java.io.IOException;
18  import java.util.HashMap;
19  import java.util.LinkedHashMap;
20  
21  /**
22   * Created by alessio on 15/05/15.
23   */
24  
25  public class JohanssonPreprocessor {
26  
27  	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(JohanssonPreprocessor.class);
28  	public static final String DEFAULT_NAMESPACE = "http://eu.fbk.dkm.pikes.resources.mpqa.cs.pitt.edu/corpora/mpqa_corpus/";
29  
30  	private static class Span {
31  		private int start, end, id;
32  		private String value;
33  
34  		public Span(int start, int end, int id, String value) {
35  			this.start = start;
36  			this.end = end;
37  			this.id = id;
38  			this.value = value;
39  		}
40  
41  		public int getStart() {
42  			return start;
43  		}
44  
45  		public int getEnd() {
46  			return end;
47  		}
48  
49  		public int getId() {
50  			return id;
51  		}
52  
53  		public String getValue() {
54  			return value;
55  		}
56  
57  		@Override
58  		public String toString() {
59  			return "Span{" +
60  					"start=" + start +
61  					", end=" + end +
62  					", id=" + id +
63  					", value='" + value + '\'' +
64  					'}';
65  		}
66  	}
67  
68  	public static void main(final String[] args) throws IOException, XMLStreamException {
69  		try {
70  			final CommandLine cmd = CommandLine
71  					.parser()
72  					.withName("corpus-preprocessor")
73  					.withHeader(
74  							"Produces NAF files starting from the MPQA v.2 corpus preprocessed by Johansson/Moschitti.")
75  					.withOption("i", "input-path", "the base path of the Johansson MPQA corpus", "DIR",
76  							CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
77  					.withOption("o", "output",
78  							"the output path where to save produced files",
79  							"DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
80  					.withOption("n", "namespace",
81  							String.format("the namespace for generating document URIs, default %s", DEFAULT_NAMESPACE),
82  							"NS", CommandLine.Type.STRING, true, false, false)
83  					.withOption("doc", "doc", "Check only one document", "URL", CommandLine.Type.STRING, true, false, false)
84  					.withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
85  
86  			final File inputPath = cmd.getOptionValue("i", File.class);
87  
88  			final File outputPath = cmd.getOptionValue("o", File.class);
89  			if (!outputPath.exists()) {
90  				outputPath.mkdirs();
91  			}
92  
93  			String namespace = DEFAULT_NAMESPACE;
94  			if (cmd.hasOption("n")) {
95  				namespace = cmd.getOptionValue("n", String.class);
96  			}
97  
98  			String checkOneDoc = cmd.getOptionValue("doc", String.class);
99  
100 			DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
101 			DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
102 
103 			for (File f : Files.fileTreeTraverser().preOrderTraversal(inputPath)) {
104 
105 				// Only consider tokens
106 				if (!f.getAbsolutePath().endsWith("tokens.xml")) {
107 					continue;
108 				}
109 
110 				final String name = f.getName().replace('/', '_');
111 				final String documentURI = namespace + name;
112 
113 				if (checkOneDoc != null && !checkOneDoc.equals(documentURI)) {
114 					continue;
115 				}
116 
117 				Document doc = dBuilder.parse(f);
118 
119 				NodeList nList = doc.getElementsByTagName("annotation");
120 
121 				HashMap<String, LinkedHashMap<Integer, Span>> spans = new HashMap<>();
122 
123 				for (int temp = 0; temp < nList.getLength(); temp++) {
124 					Node nNode = nList.item(temp);
125 					if (nNode.getNodeType() != Node.ELEMENT_NODE) {
126 						continue;
127 					}
128 
129 					Element eElement = (Element) nNode;
130 
131 					String provides = eElement.getAttribute("provides");
132 					if (spans.get(provides) == null) {
133 						spans.put(provides, new LinkedHashMap<>());
134 					}
135 
136 					NodeList eS = eElement.getElementsByTagName("e");
137 					for (int spanID = 0; spanID < eS.getLength(); spanID++) {
138 						Node span = eS.item(spanID);
139 						if (span.getNodeType() != Node.ELEMENT_NODE) {
140 							continue;
141 						}
142 
143 						Element eSpan = (Element) span;
144 
145 						Integer id = Integer.parseInt(eSpan.getAttribute("id"));
146 						Integer start = Integer.parseInt(eSpan.getAttribute("start").replaceAll("#", ""));
147 						Integer end = Integer.parseInt(eSpan.getAttribute("end").replaceAll("#", ""));
148 						String value = eSpan.getTextContent();
149 
150 						Span s = new Span(start, end, id, value);
151 						spans.get(provides).put(s.id, s);
152 					}
153 				}
154 
155 				StringBuffer buffer = new StringBuffer();
156 
157 				Integer lastToken = 0;
158 				for (Span span : spans.get("SENTENCES").values()) {
159 					if (span.start != lastToken + 1) {
160 						LOGGER.warn("Missing sentence [{}/{}]", f.getName(), span.start);
161 						for (int i = lastToken + 1; i < span.start; i++) {
162 							String token = spans.get("TOKENS").get(i).getValue();
163 							token = token.replace(' ', '_');
164 							token = token.replace('<', '_');
165 							token = token.replace('>', '_');
166 							buffer.append(token).append(" ");
167 						}
168 						buffer.append("\n");
169 					}
170 					for (int i = span.start; i <= span.end; i++) {
171 						String token = spans.get("TOKENS").get(i).getValue();
172 						token = token.replace(' ', '_');
173 						token = token.replace('<', '_');
174 						token = token.replace('>', '_');
175 						buffer.append(token).append(" ");
176 					}
177 					lastToken = span.end;
178 					buffer.append("\n");
179 				}
180 
181 				String text = buffer.toString();
182 
183 				File nafFile = new File(outputPath.getAbsolutePath() + File.separator + name);
184 
185 				final KAFDocument document = new KAFDocument("en", "v3");
186 
187 				document.setRawText(text);
188 
189 				document.createPublic();
190 				document.getPublic().publicId = Statements.VALUE_FACTORY.createIRI(documentURI).getLocalName();
191 				document.getPublic().uri = documentURI;
192 
193 				document.createFileDesc();
194 //				document.getFileDesc().author = source + " / " + description;
195 //				document.getFileDesc().creationtime = createTime;
196 //				document.getFileDesc().filename = mediaFile;
197 //				document.getFileDesc().filetype = mediaType;
198 //				document.getFileDesc().title = title + " (" + topic + " / " + country + ")";
199 
200 				document.save(nafFile.getAbsolutePath());
201 //				System.out.println(text);
202 			}
203 
204 
205 		} catch (final Throwable ex) {
206 			CommandLine.fail(ex);
207 		}
208 	}
209 
210 }