1   package eu.fbk.dkm.pikes.resources.vuaopinion;
2   
3   import eu.fbk.utils.core.CommandLine;
4   import ixa.kaflib.KAFDocument;
5   import ixa.kaflib.WF;
6   import org.apache.commons.io.FileUtils;
7   import org.apache.commons.io.FilenameUtils;
8   import org.slf4j.LoggerFactory;
9   
10  import java.io.File;
11  import java.io.IOException;
12  import java.text.SimpleDateFormat;
13  import java.util.Date;
14  import java.util.Iterator;
15  import java.util.List;
16  import java.util.Locale;
17  
18  /**
19   * Created by alessio on 09/04/15.
20   */
21  
22  public class CorpusPreprocessor {
23  
24  	private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CorpusPreprocessor.class);
25  
26  	static public KAFDocument text2naf(String text) {
27  		KAFDocument doc = new KAFDocument("en", "v3");
28  		doc.setRawText(text);
29  
30  		String date = "";
31  		try {
32  			date = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", new Locale("en")).format(new Date());
33  		} catch (Exception e) {
34  			LOGGER.error(e.getMessage());
35  		}
36  
37  		KAFDocument.Public p = doc.createPublic();
38  		p.uri = "http://www.example.com";
39  		p.publicId = "0";
40  
41  		KAFDocument.FileDesc d = doc.createFileDesc();
42  		d.creationtime = date;
43  		d.author = "Unknown author";
44  		d.filename = "test.xml";
45  		d.title = "Unknown title";
46  
47  		return doc;
48  	}
49  
50  	public static void main(String[] args) {
51  
52  		try {
53  			CommandLine cmd = null;
54  			cmd = CommandLine
55  					.parser()
56  					.withName("corpus-preprocessor")
57  					.withHeader(
58  							"Convert KAF to NAF")
59  					.withOption("i", "input-path", "the base EN path of the corpus", "DIR",
60  							CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
61  					.withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
62  
63  			final File inputPath = cmd.getOptionValue("i", File.class);
64  			if (!inputPath.exists()) {
65  				throw new IOException(String.format("Folder %s does not exist", inputPath.getAbsolutePath()));
66  			}
67  
68  			File kafPath = new File(inputPath.getAbsolutePath() + File.separator + "kaf");
69  			if (!kafPath.exists()) {
70  				throw new IOException(String.format("Folder %s does not exist", kafPath.getAbsolutePath()));
71  			}
72  			File nafPath = new File(inputPath.getAbsolutePath() + File.separator + "naf");
73  			if (nafPath.exists()) {
74  				throw new IOException(String.format("Folder %s exists", nafPath.getAbsolutePath()));
75  			}
76  			nafPath.mkdir();
77  
78  			Iterator<File> fileIterator;
79  			fileIterator = FileUtils.iterateFiles(kafPath, new String[]{"kaf"}, false);
80  
81  			while (fileIterator.hasNext()) {
82  				File file = fileIterator.next();
83  				String fileBaseName = FilenameUtils.removeExtension(file.getName());
84  				KAFDocument document = KAFDocument.createFromFile(file);
85  
86  				StringBuffer buffer = new StringBuffer();
87  				List<WF> wFs = document.getWFs();
88  				for (WF wf : wFs) {
89  					buffer.append(wf.getForm());
90  					buffer.append(" ");
91  				}
92  				String text = buffer.toString().trim();
93  
94  				KAFDocument doc = text2naf(text);
95  				File nafFile = new File(nafPath.getAbsolutePath() + File.separator + fileBaseName + ".naf");
96  				doc.save(nafFile.getAbsolutePath());
97  //				System.out.println(fileBaseName);
98  			}
99  
100 		} catch (final Throwable ex) {
101 			CommandLine.fail(ex);
102 		}
103 
104 	}
105 }