1
2 package eu.fbk.dkm.pikes.resources.reader;
3
4 import org.xml.sax.*;
5 import org.xml.sax.helpers.DefaultHandler;
6 import se.lth.cs.nlp.nlputils.core.Ax;
7 import se.lth.cs.nlp.nlputils.core.ListMap;
8
9 import javax.xml.parsers.ParserConfigurationException;
10 import java.io.*;
11 import java.util.*;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14
15
16
17 public class LKCollectionReader {
18
19 private ArrayList<File> textFiles = new ArrayList();
20 private int nextFile = -1;
21
22 private HashMap<String, ArrayList<String>> annFileNames = new HashMap();
23
24 private static final Pattern BASE_PAT = Pattern.compile("name=\"base\">(.*)</tag>");
25
26 private static final Pattern ON_FILE_PAT = Pattern.compile("scope=\"(.*?)\"");
27 private static final Pattern ON_FILES_PAT = Pattern.compile("on-files=\"(.*?)\"");
28
29 private String workDir = null;
30
31 private HashSet<String> usedAnnotations;
32
/**
 * Creates a reader over all LK text documents found in a directory.
 *
 * @param dir path of the directory holding the text and annotation files
 * @throws IOException if a file in the directory cannot be read
 * @throws IllegalArgumentException if {@code dir} is not a directory
 */
public LKCollectionReader(String dir) throws IOException {
    // No annotation filter and no explicit file list: scan the whole directory.
    this(null, dir, null);
}
36
/**
 * Creates a reader restricted to an explicit list of text documents.
 *
 * @param dir      path of the directory holding the text and annotation files
 * @param fileList names of the text documents to read; when {@code null}
 *                 the whole directory is scanned instead
 * @throws IOException if a file in the directory cannot be read
 * @throws IllegalArgumentException if {@code dir} is not a directory or a
 *                                  listed file cannot be found
 */
public LKCollectionReader(String dir, List<String> fileList) throws IOException {
    // No annotation filter is applied by the public constructors.
    this(null, dir, fileList);
}
40
/**
 * Shared constructor: records the working directory, builds the list of text
 * and annotation files, and positions the reader on the first document.
 *
 * @param usedAnnotations annotation names to remember, or {@code null}
 * @param dir             directory holding the text and annotation files
 * @param fileList        text documents to read, or {@code null} to scan the
 *                        whole directory
 * @throws IOException if a file in the directory cannot be read
 * @throws IllegalArgumentException if {@code dir} is null or not a directory
 */
private LKCollectionReader(Collection<String> usedAnnotations, String dir,
        List<String> fileList) throws IOException {
    if (dir == null) {
        // Fail with the same diagnostic as a non-directory path instead of
        // a bare NullPointerException from new File(null).
        throw new IllegalArgumentException("Must specify a directory");
    }
    workDir = dir;
    if (usedAnnotations != null) {
        this.usedAnnotations = new HashSet<String>(usedAnnotations);
    }

    if (fileList == null) {
        makeFileList(dir);
    } else {
        makeFileList(dir, fileList);
    }

    // Only now is the reader ready to iterate.
    nextFile = 0;
}
61
62 private void makeFileList(String dir) throws IOException {
63 File df = new File(dir);
64 if (!df.isDirectory()) {
65 throw new IllegalArgumentException("Must specify a directory");
66 }
67
68 File[] list = df.listFiles();
69 for (File f : list) {
70
71
72 BufferedReader br = Ax.openFileReader(f.getAbsolutePath());
73 String line = br.readLine();
74
75 int count = 0;
76 while (count < 3 && line != null) {
77
78 count++;
79 if (line.contains("<lk-text")) {
80 textFiles.add(f);
81
82 break;
83 }
84 if (line.contains("<lk-annotation")) {
85 String base = null;
86 while (line != null) {
87
88 Matcher m = BASE_PAT.matcher(line);
89 if (m.find()) {
90 base = m.group(1);
91
92 ArrayList<String> afns = annFileNames.get(base);
93 if (afns == null) {
94 afns = new ArrayList();
95 annFileNames.put(base, afns);
96
97 }
98 afns.add(f.getName());
99 break;
100 }
101 line = br.readLine();
102 }
103 break;
104 }
105 line = br.readLine();
106 }
107
108 br.close();
109 }
110 }
111
112 private void makeFileList(String dir, List<String> listedFiles) throws IOException {
113 ArrayList<String> copy = new ArrayList(listedFiles);
114 File df = new File(dir);
115 if (!df.isDirectory()) {
116 throw new IllegalArgumentException("Must specify a directory");
117 }
118
119 for (ListIterator<String> iter = copy.listIterator(); iter.hasNext(); ) {
120 String fn = iter.next();
121
122
123
124 if (!fn.endsWith(".xml"))
125 {
126 fn = fn + ".lktext.xml";
127 }
128 File f = new File(dir + File.separator + fn);
129 if (!f.exists() && fn.startsWith(dir)) {
130 fn = fn.substring(dir.length());
131 f = new File(dir + File.separator + fn);
132 }
133 if (!f.exists()) {
134 String fn2 = fn.replaceAll("/", "_");
135 f = new File(dir + File.separator + fn2);
136 if (!f.exists()) {
137 throw new IllegalArgumentException("file " + fn + " does not exist");
138 }
139 fn = fn2;
140 }
141 iter.set(fn);
142 textFiles.add(f);
143 }
144
145
146 File[] list = df.listFiles();
147 for (File f : list) {
148 Scanner sc = new Scanner(f);
149 int count = 0;
150 while (count < 3 && sc.hasNextLine()) {
151 String line = sc.nextLine();
152 count++;
153 if (line.contains("<lk-text")) {
154
155 break;
156 }
157 if (line.contains("<lk-annotation")) {
158 String base = null;
159 while (sc.hasNextLine()) {
160 String line2 = sc.nextLine();
161 Matcher m = BASE_PAT.matcher(line2);
162 if (m.find()) {
163 base = m.group(1);
164 if (copy.contains(base)) {
165 ArrayList<String> afns = annFileNames.get(base);
166 if (afns == null) {
167 afns = new ArrayList();
168 annFileNames.put(base, afns);
169 }
170 afns.add(f.getName());
171 }
172 break;
173 }
174 }
175 break;
176 }
177 }
178 sc.close();
179 }
180 }
181
182 public boolean hasNext() {
183 return (nextFile >= 0 && nextFile < textFiles.size());
184
185 }
186
187 public LKAnnotatedText next() {
188
189 try {
190
191
192
193
194 File file = (File) textFiles.get(nextFile);
195
196
197
198 LKTextParserCallback tcb = null;
199
200 XMLReader reader = makeXMLReader();
201 tcb = new LKTextParserCallback();
202 reader.setContentHandler(tcb);
203
204 InputSource is = new InputSource(new FileInputStream(file));
205
206 reader.parse(is);
207
208 String text = tcb.getText();
209
210
211
212
213 ArrayList<LKAnnotationLayer> layers
214 = readAnnotations(file.getName(),
215 annFileNames.get(file.getName()));
216
217 nextFile++;
218
219 LKAnnotatedText out = new LKAnnotatedText();
220 out.rawText = text;
221 out.layers = layers;
222 out.metaInfo = tcb.metaInfo;
223 return out;
224
225 } catch (Exception e) {
226 throw new RuntimeException(e);
227 }
228
229 }
230
231 LKAnnotationLayer getLayer(String provides, ArrayList<LKAnnotationLayer> ls) {
232 for (LKAnnotationLayer l : ls) {
233 if (l.provides.equals(provides)) {
234 return l;
235 }
236 }
237 return null;
238 }
239
240
241
242 private XMLReader makeXMLReader()
243 throws SAXException, ParserConfigurationException {
244
245 javax.xml.parsers.SAXParserFactory saxParserFactory =
246 javax.xml.parsers.SAXParserFactory.newInstance();
247
248 final javax.xml.parsers.SAXParser saxParser
249 = saxParserFactory.newSAXParser();
250
251 final XMLReader parser = saxParser.getXMLReader();
252
253 return parser;
254 }
255
256 private static class LKTextParserCallback extends DefaultHandler
257 implements ContentHandler {
258
259 private boolean insideText = false;
260 private StringBuilder sb = new StringBuilder();
261 private HashMap<String, String> metaInfo = new HashMap();
262
263 private boolean insideTag = false;
264 private StringBuilder tag = null;
265 private String currentTagName = null;
266
267 public void startElement(String namespace, String localname,
268 String type, Attributes attributes) {
269 if (type.equals("tag")) {
270 String name = attributes.getValue("name");
271 if (name == null) {
272 throw new RuntimeException("no name for tag");
273 }
274 currentTagName = name;
275 tag = new StringBuilder();
276 insideTag = true;
277 }
278 else if (type.equals("text")) {
279 insideText = true;
280
281 }
282 else if (type.matches("lk-text|meta-info")) {
283
284 }
285 else {
286 throw new RuntimeException("illegal type: " + type);
287 }
288
289 }
290
291 public void endElement(String namespace, String localname,
292 String type) {
293 if (type.equals("text")) {
294 insideText = false;
295 }
296 else if (type.equals("tag")) {
297 insideTag = false;
298 metaInfo.put(currentTagName, tag.toString());
299 }
300 }
301
302 public void characters(char[] ch, int start, int len) {
303 if (insideText) {
304 String s = new String(ch, start, len);
305 sb.append(s);
306 }
307 else if (insideTag) {
308 String s = new String(ch, start, len);
309 tag.append(s);
310 }
311 }
312
313 String getText() {
314 return sb.toString();
315 }
316
317 HashMap<String, String> getMetaInfo() {
318 return metaInfo;
319 }
320
321 }
322
323 private ArrayList<LKAnnotationLayer> readAnnotations(String baseText, ArrayList<String> files) throws IOException {
324 if (files == null) {
325 throw new RuntimeException("file list = null");
326 }
327
328 ListMap<String, LKAnnotationLayer> layerMap = new ListMap();
329
330 ArrayList<String> sorted = sortAnnFiles(files);
331
332 ArrayList<LKAnnotationLayer> out = new ArrayList();
333
334 if (sorted.get(0).equals(baseText)) {
335 sorted.remove(0);
336 }
337
338 for (String file : sorted) {
339 try {
340
341 XMLReader reader = makeXMLReader();
342 LKAnnotationParserCallback acb
343 = new LKAnnotationParserCallback(baseText, layerMap);
344 reader.setContentHandler(acb);
345 String fullFileName = workDir + File.separatorChar + file;
346 InputSource is = new InputSource(fullFileName);
347 reader.parse(is);
348 layerMap.putAll(file, acb.getLayers());
349 out.addAll(acb.getLayers());
350 } catch (SAXException e) {
351 throw new IOException(e);
352 } catch (ParserConfigurationException e) {
353 throw new IOException(e);
354 }
355 }
356
357 return out;
358 }
359
360 private static class LKAnnotationParserCallback extends DefaultHandler
361 implements ContentHandler {
362
363 private String baseTextFile;
364
365 private ListMap<String, LKAnnotationLayer> layerMap;
366 private ArrayList<LKAnnotationLayer> layers = new ArrayList();
367 private LKAnnotationLayer current;
368
369 private boolean onTextFile = false;
370
371 private ArrayList<LKAnnotationLayer> currentScope = null;
372
373 private boolean insideTag = false;
374 private boolean insideEntity = false;
375 private StringBuilder tag = null;
376 private String currentTagName = null;
377
378 private HashMap<String, String> metaInfo = new HashMap();
379
380 private LinkedList<DataElementNode> stack;
381
382 LKAnnotationParserCallback(String baseTextFile,
383 ListMap<String, LKAnnotationLayer> layerMap) {
384 this.baseTextFile = baseTextFile;
385 this.layerMap = layerMap;
386 }
387
388 public void startElement(String namespace, String localname,
389 String type, Attributes attributes) {
390
391 if (insideEntity) {
392 DataElementNode parent = stack.getLast();
393 DataElementNode n = new DataElementNode(type);
394 for (int i = attributes.getLength() - 1; i >= 0; i--) {
395 String k = attributes.getQName(i);
396 String v = attributes.getValue(i);
397 n.attributes.put(k, v);
398 }
399 parent.children.add(n);
400 stack.add(n);
401 }
402 else if (type.equals("e")) {
403 LKAnnotationEntity e = new LKAnnotationEntity();
404
405 String on = attributes.getValue("on");
406 String start = attributes.getValue("start");
407 String end = attributes.getValue("end");
408 String from = attributes.getValue("from");
409 String to = attributes.getValue("to");
410
411 if (on != null) {
412 if (start != null) {
413 throw new RuntimeException("on!=null => start=null");
414 }
415 if (end != null) {
416 throw new RuntimeException("on!=null => end=null");
417 }
418 if (from != null) {
419 throw new RuntimeException("on!=null => from=null");
420 }
421 if (to != null) {
422 throw new RuntimeException("on!=null => to=null");
423 }
424 }
425
426 if (from != null || to != null) {
427 if (from == null) {
428 throw new RuntimeException("to!=null => from!=null");
429 }
430 if (to == null) {
431 throw new RuntimeException("from!=null => to!=null");
432 }
433 if (end != null) {
434 throw new RuntimeException("from!=null => end=null");
435 }
436 if (start != null) {
437 throw new RuntimeException("from!=null => start=null");
438 }
439 }
440 if (start != null || end != null) {
441 if (end == null) {
442 throw new RuntimeException("start!=null => end!=null");
443 }
444 if (start == null) {
445 throw new RuntimeException("end!=null => start!=null");
446 }
447 }
448
449 if (onTextFile && start != null) {
450 if (!start.startsWith("#")) {
451 throw new RuntimeException("start must begin with #");
452 }
453 if (!end.startsWith("#")) {
454 throw new RuntimeException("start must begin with #");
455 }
456 e.cstart = Integer.parseInt(start.substring(1));
457 e.cend = Integer.parseInt(end.substring(1)) + 1;
458 }
459 else if (start != null) {
460 LKAnnotationLayer[] l1 = new LKAnnotationLayer[1];
461 int ix1 = dereferenceId(start, l1);
462 LKAnnotationLayer[] l2 = new LKAnnotationLayer[1];
463 int ix2 = dereferenceId(end, l2);
464 if (l1[0] != l2[0]) {
465 throw new RuntimeException("different layers in start-end");
466 }
467 e.referred = new ArrayList();
468 for (int i = ix1; i <= ix2; i++) {
469 e.referred.add(l1[0].entityList.get(i));
470 }
471 }
472 else if (on != null) {
473 String[] set = on.split("\\,\\s*");
474 e.referred = new ArrayList();
475 LKAnnotationLayer[] l = new LKAnnotationLayer[1];
476 for (String s : set) {
477 int ix = dereferenceId(s, l);
478 e.referred.add(l[0].entityList.get(ix));
479 }
480 }
481 else if (from != null) {
482 LKAnnotationLayer[] l = new LKAnnotationLayer[1];
483 int ix1 = dereferenceId(from, l);
484 e.from = l[0].entityList.get(ix1);
485 int ix2 = dereferenceId(to, l);
486 e.to = l[0].entityList.get(ix2);
487 }
488
489 String id = attributes.getValue("id");
490 if (id == null) {
491 throw new RuntimeException("no id");
492 }
493 if (current.idToIndex.containsKey(id)) {
494 throw new RuntimeException("id must be unique");
495 }
496
497 e.localURI = id;
498
499
500
501 current.idToIndex.put(id, current.entityList.size());
502 current.entityList.add(e);
503
504 insideEntity = true;
505 stack = new LinkedList();
506 DataElementNode n = new DataElementNode("__ROOT__");
507 stack.add(n);
508 e.data = n;
509
510 }
511 else if (type.equals("tag")) {
512 String name = attributes.getValue("name");
513 if (name == null) {
514 throw new RuntimeException("no name for tag");
515 }
516 currentTagName = name;
517 tag = new StringBuilder();
518 insideTag = true;
519 }
520 else if (type.equals("annotation")) {
521 current = new LKAnnotationLayer();
522 String scopeFile = attributes.getValue("scope");
523 String onFiles = attributes.getValue("on-files");
524 if (onFiles != null) {
525 throw new RuntimeException("on-files is unimplemented: currently, we can only handle annotation layers with scope");
526 }
527
528 if (scopeFile != null && !scopeFile.equals("")) {
529 if (scopeFile.contains("lktext")) {
530 currentScope = null;
531 }
532 else {
533 currentScope = layerMap.get(scopeFile);
534 if (currentScope == null) {
535 throw new RuntimeException("scope not found: |" + scopeFile + "|");
536 }
537 }
538 }
539 else {
540 currentScope = layers;
541 }
542
543
544 onTextFile = scopeFile != null && scopeFile.equals(baseTextFile);
545 current.scopeFile = scopeFile;
546 current.provides = attributes.getValue("provides");
547 layers.add(current);
548 }
549 else if (type.matches("lk-annotation|meta-info")) {
550
551 }
552 else {
553 throw new RuntimeException("illegal type: " + type);
554 }
555 }
556
557 private int dereferenceId(String ref, LKAnnotationLayer[] lout) {
558 if (ref == null) {
559 throw new IllegalArgumentException("null reference");
560 }
561 ref = ref.trim();
562 int ix = ref.indexOf('#');
563 LKAnnotationLayer l = null;
564 String fileRef = null;
565 String idRef = null;
566 if (ix == -1) {
567
568 throw new RuntimeException("No fragment identifier");
569 }
570 else {
571 fileRef = ref.substring(0, ix);
572 idRef = ref.substring(ix + 1);
573 }
574
575 ArrayList<LKAnnotationLayer> scope;
576 if (fileRef == null || fileRef.equals("")) {
577 scope = currentScope;
578 }
579
580
581 else {
582
583 scope = layerMap.get(fileRef);
584 if (scope == null) {
585 throw new RuntimeException("scope not found: " + fileRef);
586 }
587 }
588 for (LKAnnotationLayer ll : scope) {
589 Integer llIx = ll.idToIndex.get(idRef);
590 if (llIx != null) {
591 lout[0] = ll;
592 return llIx;
593 }
594 }
595 throw new RuntimeException("entity " + idRef + " not found");
596 }
597
598 public void endElement(String namespace, String localname,
599 String type) {
600
601 if (insideEntity) {
602 if (stack.size() == 1) {
603 insideEntity = false;
604
605 }
606 else {
607 stack.removeLast();
608 }
609 }
610 else if (type.equals("tag")) {
611 insideTag = false;
612 metaInfo.put(currentTagName, tag.toString());
613 }
614 else if (type.equals("annotation")) {
615 current = null;
616 }
617 }
618
619 public void characters(char[] ch, int start, int len) {
620 if (insideEntity) {
621 String s = new String(ch, start, len);
622 DataTextNode n = new DataTextNode(s);
623 stack.getLast().children.add(n);
624 }
625 else if (insideTag) {
626 String s = new String(ch, start, len);
627 tag.append(s);
628 }
629 }
630
631 ArrayList<LKAnnotationLayer> getLayers() {
632 return layers;
633 }
634
635 HashMap<String, String> getMetaInfo() {
636 return metaInfo;
637 }
638
639 }
640
641 private ArrayList<String> sortAnnFiles(ArrayList<String> files) throws IOException {
642
643 ListMap<String, String> dg = createAnnDepGraph(files);
644
645 ArrayList<String> out = tsort(dg);
646
647 return out;
648 }
649
650 private <T> ArrayList<T> tsort(ListMap<T, T> depGraph) {
651
652 HashSet<T> starts = new HashSet<T>();
653 for (T k : depGraph.keySet()) {
654 starts.add(k);
655 }
656
657 for (T k : depGraph.keySet()) {
658 starts.removeAll(depGraph.get(k));
659 }
660
661 if (starts.size() == 0) {
662 throw new RuntimeException("cyclic or empty graph!");
663 }
664
665 LinkedList<T> q = new LinkedList(starts);
666 ArrayList<T> out = new ArrayList();
667
668 while (!q.isEmpty()) {
669 T t = q.removeFirst();
670 out.add(t);
671 ArrayList<T> sl = depGraph.get(t);
672 if (sl != null) {
673 starts.clear();
674 starts.addAll(sl);
675 for (T s : sl) {
676 ArrayList<T> sl2 = depGraph.get(s);
677 if (sl2 != null) {
678 starts.removeAll(sl2);
679 }
680 }
681 q.addAll(starts);
682 }
683 }
684 HashSet<T> seen = new HashSet();
685 for (Iterator<T> it = out.iterator(); it.hasNext(); ) {
686 T t = it.next();
687 if (seen.contains(t)) {
688 it.remove();
689 }
690 else {
691 seen.add(t);
692 }
693 }
694 return out;
695 }
696
697 private <T> ArrayList<T> tsort_orig(ListMap<T, T> depGraph) {
698 HashSet<T> starts = new HashSet<T>();
699 for (T k : depGraph.keySet()) {
700 starts.add(k);
701 }
702
703 for (T k : depGraph.keySet()) {
704 starts.removeAll(depGraph.get(k));
705 }
706
707 if (starts.size() == 0) {
708 throw new RuntimeException("cyclic or empty graph!");
709 }
710
711 LinkedList<T> q = new LinkedList(starts);
712 ArrayList<T> out = new ArrayList();
713
714 while (!q.isEmpty()) {
715 T t = q.removeFirst();
716 out.add(t);
717 ArrayList<T> sl = depGraph.get(t);
718
719 if (sl != null) {
720 q.addAll(sl);
721 }
722 }
723 HashSet<T> seen = new HashSet();
724 for (Iterator<T> it = out.iterator(); it.hasNext(); ) {
725 T t = it.next();
726 if (seen.contains(t)) {
727 it.remove();
728 }
729 else {
730 seen.add(t);
731 }
732 }
733
734 return out;
735 }
736
737 private ListMap<String, String> createAnnDepGraph(ArrayList<String> files) throws IOException {
738 ListMap<String, String> out = new ListMap();
739 for (String fn : files) {
740 String full = workDir + File.separatorChar + fn;
741 BufferedReader br = new BufferedReader(new FileReader(full));
742 String line = br.readLine();
743 while (line != null) {
744 line = line.trim();
745 if (!line.startsWith("<e")) {
746 Matcher m1 = ON_FILE_PAT.matcher(line);
747 if (m1.find()) {
748 String ref = m1.group(1);
749 out.put(ref, fn);
750 }
751 else {
752 Matcher m2 = ON_FILES_PAT.matcher(line);
753 if (m2.find()) {
754 throw new RuntimeException("on-files is unimplemented...");
755 }
756 }
757 }
758
759 line = br.readLine();
760 }
761 br.close();
762 }
763 return out;
764 }
765
766 public static void main(String[] argv) {
767 try {
768 LKCollectionReader r = new LKCollectionReader(argv[0]);
769 int i = 0;
770 while (r.hasNext()) {
771 i++;
772 LKAnnotatedText annotatedText = r.next();
773 LKAnnotationLayer layer = annotatedText.getLayer("MPQA-expressive-subjectivity");
774 System.out.println(layer.onFiles);
775 System.out.println(layer.scopeFile);
776 for (LKAnnotationEntity entity : layer.entityList) {
777 System.out.println(entity);
778 }
779 break;
780
781
782
783 }
784 } catch (Exception e) {
785 e.printStackTrace();
786 }
787 }
788
789 }