package eu.fbk.dkm.pikes.naflib;

import eu.fbk.utils.core.CommandLine;
import ixa.kaflib.LinguisticProcessor;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

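/**
 * Command-line tool that computes corpus-level statistics (number of
 * documents, sentences, and tokens, plus aggregate processing time derived
 * from linguistic-processor timestamps) over a folder of NAF files.
 *
 * Example invocation (the corpus path is illustrative):
 *
 *   statistics -i /path/to/naf-corpus -r
 */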
public class CorpusStatistics {

    private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CorpusStatistics.class);

    // Pattern of the NAF layer timestamps. SimpleDateFormat is not
    // thread-safe, so every parse() call below is synchronized on a shared lock.
    private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");

    public static void main(String[] args) {
        try {
            final CommandLine cmd = CommandLine
                    .parser()
                    .withName("statistics")
                    .withHeader("Calculate statistics on a corpus")
                    .withOption("i", "input-folder", "the folder of the NAF corpus", "DIR",
                            CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
                    .withOption("r", "recursive", "parse the folder recursively")
                    .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);

            File inputFolder = cmd.getOptionValue("input-folder", File.class);
            boolean recursive = cmd.hasOption("recursive");

            Corpus corpus = Corpus.create(recursive, inputFolder);

            final AtomicLong tokens = new AtomicLong();
            final AtomicLong documents = new AtomicLong();
            final AtomicLong sentences = new AtomicLong();
            final AtomicLong milliseconds = new AtomicLong();

            // Shared monitor guarding console output and the SimpleDateFormat above.
            final Object lock = new Object();

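            // Traverse the corpus in parallel; per-document counts are folded
            // into the atomic counters declared above.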
            corpus.parallelStream().forEach(document -> {
                if (document != null) {
                    tokens.addAndGet(document.getTerms().size());
                    long numDoc = documents.incrementAndGet();

                    // Print a progress dot per document and the running total
                    // every 100 documents.
                    synchronized (lock) {
                        System.out.print(".");
                        if (numDoc % 100 == 0) {
                            System.out.print(" ");
                            System.out.print(numDoc);
                            System.out.println();
                        }
                    }

                    sentences.addAndGet(document.getSentences().size());

                    // Estimate processing time as the span from the earliest
                    // begin timestamp to the latest end timestamp across all
                    // linguistic processors recorded in the document.
                    Long start = null;
                    Long end = null;

                    for (String lpName : document.getLinguisticProcessors().keySet()) {
                        List<LinguisticProcessor> linguisticProcessors = document.getLinguisticProcessors().get(lpName);
                        for (LinguisticProcessor linguisticProcessor : linguisticProcessors) {
                            Date startDate = null, endDate = null;
                            try {
                                // Guard the shared (non-thread-safe) SimpleDateFormat.
                                synchronized (lock) {
                                    startDate = sdf.parse(linguisticProcessor.getBeginTimestamp());
                                    endDate = sdf.parse(linguisticProcessor.getEndTimestamp());
                                }
                            } catch (Exception e) {
                                // Skip processors with missing or malformed timestamps.
                                continue;
                            }

                            if (start == null || startDate.getTime() < start) {
                                start = startDate.getTime();
                            }
                            if (end == null || endDate.getTime() > end) {
                                end = endDate.getTime();
                            }
                        }
                    }

                    if (start != null && end != null) {
                        long diff = end - start;
                        // Ignore non-positive spans and implausibly large ones
                        // (over 1,000,000 ms, i.e. ~16.7 minutes), which usually
                        // indicate clock or timestamp artifacts.
                        if (diff > 0 && diff < 1000000) {
                            milliseconds.addAndGet(diff);
                        }
                    }
                }
            });

            System.out.println("\n");

            LOGGER.info("Documents: {}", documents.get());
            LOGGER.info("Sentences: {}", sentences.get());
            LOGGER.info("Tokens: {}", tokens.get());
            LOGGER.info("Milliseconds: {}", milliseconds.get());
            LOGGER.info("Milliseconds per token: {}", (milliseconds.get() * 1.0) / tokens.get());
            LOGGER.info("Milliseconds per sentence: {}", (milliseconds.get() * 1.0) / sentences.get());
            LOGGER.info("Milliseconds per document: {}", (milliseconds.get() * 1.0) / documents.get());

        } catch (final Throwable ex) {
            CommandLine.fail(ex);
        }
    }
}