1   package eu.fbk.dkm.pikes.naflib;
2   
import eu.fbk.utils.core.CommandLine;
import ixa.kaflib.LinguisticProcessor;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.text.SimpleDateFormat;
import java.time.OffsetDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
12  
13  /**
14   * Created by alessio on 04/06/15.
15   */
16  
17  public class CorpusStatistics {
18  
19  	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CorpusStatistics.class);
20  	private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
21  
22  	public static void main(String[] args) {
23  		try {
24  			final CommandLine cmd = CommandLine
25  					.parser()
26  					.withName("statistics")
27  					.withHeader("Calculate statistics on a corpus")
28  					.withOption("i", "input-folder", "the folder of the NAF corpus", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
29  					.withOption("r", "recursive", "parse folder recursively")
30  					.withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
31  
32  			File inputFolder = cmd.getOptionValue("input-folder", File.class);
33  			Boolean recursive = cmd.hasOption("recursive");
34  
35  			Corpus corpus = Corpus.create(recursive, inputFolder);
36  
37  			final AtomicLong tokens = new AtomicLong();
38  			final AtomicLong documents = new AtomicLong();
39  			final AtomicLong sentences = new AtomicLong();
40  			final AtomicLong milliseconds = new AtomicLong();
41  
42  			final Object lock = new Object();
43  
44  			corpus.parallelStream().forEach(document -> {
45  				if (document != null) {
46  					tokens.addAndGet(document.getTerms().size());
47  					long numDoc = documents.incrementAndGet();
48  
49  					synchronized (lock) {
50  						System.out.print(".");
51  						if (numDoc % 100 == 0) {
52  							System.out.print(" ");
53  							System.out.print(numDoc);
54  							System.out.println();
55  						}
56  					}
57  
58  					sentences.addAndGet(document.getSentences().size());
59  
60  					Long start = null;
61  					Long end = null;
62  
63  					for (String lpName : document.getLinguisticProcessors().keySet()) {
64  						List<LinguisticProcessor> linguisticProcessors = document.getLinguisticProcessors().get(lpName);
65  						for (LinguisticProcessor linguisticProcessor : linguisticProcessors) {
66  							Date startDate = null, endDate = null;
67  							try {
68  								synchronized (lock) {
69  									startDate = sdf.parse(linguisticProcessor.getBeginTimestamp());
70  									endDate = sdf.parse(linguisticProcessor.getEndTimestamp());
71  								}
72  							} catch (Exception e) {
73  								continue;
74  							}
75  
76  							if (start == null || startDate.getTime() < start) {
77  								start = startDate.getTime();
78  							}
79  							if (end == null || endDate.getTime() > end) {
80  								end = endDate.getTime();
81  							}
82  						}
83  					}
84  
85  					if (start != null && end != null) {
86  						Long diff = end - start;
87  						if (diff > 0 && diff < 1000000) {
88  							milliseconds.addAndGet(diff);
89  						}
90  //						else {
91  //							System.out.println(new Date(start));
92  //							System.out.println(new Date(end));
93  //							System.out.println(document.getPublic().uri);
94  //						}
95  					}
96  				}
97  			});
98  
99  			System.out.println("\n");
100 
101 			LOGGER.info("Documents: {}", documents.get());
102 			LOGGER.info("Sentences: {}", sentences.get());
103 			LOGGER.info("Tokens: {}", tokens.get());
104 			LOGGER.info("Milliseconds: {}", milliseconds.get());
105 			LOGGER.info("Milliseconds per token: {}", (milliseconds.get() * 1.0) / (tokens.get() * 1.0));
106 			LOGGER.info("Milliseconds per sentence: {}", (milliseconds.get() * 1.0) / (sentences.get() * 1.0));
107 			LOGGER.info("Milliseconds per document: {}", (milliseconds.get() * 1.0) / (documents.get() * 1.0));
108 
109 		} catch (final Throwable ex) {
110 			CommandLine.fail(ex);
111 		}
112 	}
113 }