1 package eu.fbk.dkm.pikes.resources.enronEmailDataset;
2
3 import com.google.common.io.Files;
4 import eu.fbk.dkm.pikes.resources.ecb.ConvertECBPlus;
5 import eu.fbk.utils.core.CommandLine;
6 import eu.fbk.utils.core.IO;
7 import ixa.kaflib.KAFDocument;
8 import org.apache.commons.lang.time.DateUtils;
9 import org.apache.commons.mail.util.MimeMessageParser;
10 import org.apache.james.mime4j.codec.DecodeMonitor;
11 import org.apache.james.mime4j.message.DefaultBodyDescriptorBuilder;
12 import org.apache.james.mime4j.parser.ContentHandler;
13 import org.apache.james.mime4j.parser.MimeStreamParser;
14 import org.apache.james.mime4j.stream.BodyDescriptorBuilder;
15 import org.apache.james.mime4j.stream.MimeConfig;
16 import org.slf4j.Logger;
17 import org.slf4j.LoggerFactory;
18
19
20 import javax.mail.Header;
21 import javax.mail.Session;
22 import javax.mail.internet.MimeMessage;
23 import java.io.*;
24 import java.nio.charset.StandardCharsets;
25 import java.text.ParseException;
26 import java.text.SimpleDateFormat;
27 import java.util.Date;
28 import java.util.Enumeration;
29 import java.util.Properties;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
32
33
34
35
36 public class Email2NAF {
37 private static final Logger LOGGER = LoggerFactory.getLogger(Email2NAF.class);
38 private static Pattern folderPattern = Pattern.compile("^([0-9]+)");
39 private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
40 private static String[] parsePatterns = {
41 "E, dd-MMM-yy HH:mm:ss z",
42 "E, dd MMM yy HH:mm:ss z",
43 "E, dd MMM yyyy HH:mm:ss z",
44 "E MMM dd HH:mm:ss yyyy",
45 "E, dd-MMM-yy HH:mm:ss yyyy",
46 ", dd MMM yyyy HH:mm:ss z",
47 "MMM dd, yyyy",
48 "E, dd-MMM-yy z",
49 "E, dd MMM yy z",
50
51
52 "E, dd-MMM-yy HH z",
53
54
55 "E MMM dd HH:mm:ss z yy",
56 "E, dd MMM yyyy HH:mm:ss Z (z)"
57
58 };
59 private static String xml11pattern = "[^"
60 + "\u0001-\uD7FF"
61 + "\uE000-\uFFFD"
62 + "\ud800\udc00-\udbff\udfff"
63 + "]+";
64 private static String xml10pattern = "[^"
65 + "\u0009\r\n"
66 + "\u0020-\uD7FF"
67 + "\uE000-\uFFFD"
68 + "\ud800\udc00-\udbff\udfff"
69 + "]";
70
71 public static void main(String[] args) {
72 final CommandLine cmd = CommandLine
73 .parser()
74 .withName("convert-enron")
75 .withHeader("Convert Enron Email Dataset files to NAF")
76 .withOption("i", "input-path", "the base path of the corpus", "DIR",
77 CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
78 .withOption("o", "output-path", "output NAF folder", "DIR",
79 CommandLine.Type.DIRECTORY, true, false, true)
80 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
81
82 final File inputPath = cmd.getOptionValue("i", File.class);
83 final File outputPath = cmd.getOptionValue("o", File.class);
84
85 boolean opMkDirs = outputPath.mkdirs();
86 if (!opMkDirs) {
87 LOGGER.error("Unable to create folder {}", outputPath.getAbsolutePath());
88 }
89
90
91 for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputPath)) {
92 if (!file.isFile()) {
93 continue;
94 }
95 if (file.getName().startsWith(".")) {
96 continue;
97 }
98
99
100
101 String relativeFilePath = file.getAbsolutePath().substring(inputPath.getAbsolutePath().length());
102 if (relativeFilePath.startsWith(File.separator)) {
103 relativeFilePath = relativeFilePath.substring(1);
104 }
105
106 try {
107 System.out.println();
108 System.out.print("Starting processing of file: "+file.getAbsolutePath());
109
110 KAFDocument document = new KAFDocument("en", "FBK");
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136 Session s = Session.getDefaultInstance(new Properties());
137 String raw = readFile(file.getAbsolutePath());
138 InputStream is = new ByteArrayInputStream(raw.getBytes());
139
140
141
142 MimeMessage message = new MimeMessage(s, is);
143
144 message.getAllHeaderLines();
145 String from = "";
146 for (Enumeration<Header> e = message.getAllHeaders(); e.hasMoreElements();) {
147 Header h = e.nextElement();
148 if (h.getName().contains("X-From")) {
149
150 from = h.getValue().replaceAll(xml10pattern,"");;
151 }
152 }
153
154
155 MimeMessageParser parser = new MimeMessageParser(message);
156
157
158
159 String message_id = parser.getMimeMessage().getMessageID();
160 String date = parser.getMimeMessage().getSentDate().toString();
161
162
163
164
165
166
167
168
169 String content = (String) parser.getMimeMessage().getContent();
170
171
172
173
174 content = content.replaceAll(xml10pattern,"");
175
176
177
178
179
180
181
182
183
184
185 Date thisDate = null;
186 try {
187 thisDate = DateUtils.parseDate(date, parsePatterns);
188
189 } catch (ParseException e) {
190 System.out.println("DATE PROBLEM!!!");
191 }
192
193 document.setRawText(content);
194
195 KAFDocument.Public aPublic = document.createPublic();
196 aPublic.uri = "http://pikes.fbk.eu/enron/" + relativeFilePath;
197 aPublic.publicId = relativeFilePath;
198 KAFDocument.FileDesc fileDesc = document.createFileDesc();
199
200 fileDesc.title = message_id;
201 fileDesc.creationtime = sdf.format(thisDate);
202 fileDesc.author = from;
203
204 File outputFile = new File(outputPath + File.separator + relativeFilePath + ".naf");
205 outputFile.getParentFile().mkdirs();
206 document.save(outputFile);
207 System.out.print(" DONE");
208 } catch (Exception e) {
209 e.printStackTrace();
210 }
211
212
213 }
214 }
215
216 public static String readFile(final String file) throws IOException {
217 BufferedReader reader = new BufferedReader(new FileReader(file));
218 String line = null;
219 StringBuilder string_builder = new StringBuilder();
220 String ls = System.getProperty("line.separator");
221
222 try {
223 while ((line = reader.readLine()) != null) {
224 string_builder.append(line);
225 string_builder.append(ls);
226 }
227
228 return string_builder.toString();
229 } finally {
230 reader.close();
231 }
232 }
233 }