1   package eu.fbk.dkm.pikes.resources.enronEmailDataset;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.dkm.pikes.resources.ecb.ConvertECBPlus;
5   import eu.fbk.utils.core.CommandLine;
6   import eu.fbk.utils.core.IO;
7   import ixa.kaflib.KAFDocument;
8   import org.apache.commons.lang.time.DateUtils;
9   import org.apache.commons.mail.util.MimeMessageParser;
10  import org.apache.james.mime4j.codec.DecodeMonitor;
11  import org.apache.james.mime4j.message.DefaultBodyDescriptorBuilder;
12  import org.apache.james.mime4j.parser.ContentHandler;
13  import org.apache.james.mime4j.parser.MimeStreamParser;
14  import org.apache.james.mime4j.stream.BodyDescriptorBuilder;
15  import org.apache.james.mime4j.stream.MimeConfig;
16  import org.slf4j.Logger;
17  import org.slf4j.LoggerFactory;
18  
19  
20  import javax.mail.Header;
21  import javax.mail.Session;
22  import javax.mail.internet.MimeMessage;
23  import java.io.*;
24  import java.nio.charset.StandardCharsets;
25  import java.text.ParseException;
26  import java.text.SimpleDateFormat;
27  import java.util.Date;
28  import java.util.Enumeration;
29  import java.util.Properties;
30  import java.util.regex.Matcher;
31  import java.util.regex.Pattern;
32  
33  /**
34   * Created by marcorospocher on 24/04/2017.
35   */
36  public class Email2NAF {
37      private static final Logger LOGGER = LoggerFactory.getLogger(Email2NAF.class);
38      private static Pattern folderPattern = Pattern.compile("^([0-9]+)");
39      private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
40      private static String[] parsePatterns = {
41              "E, dd-MMM-yy HH:mm:ss z",  //Wednesday, 01-Jan-97 15:20:23 GMT
42              "E, dd MMM yy HH:mm:ss z", //Fri, 17 Jan 97 02:15:05 GMT
43              "E, dd MMM yyyy HH:mm:ss z", //Wed, 01 Jan 1997 15:21:07 GM
44              "E MMM dd HH:mm:ss yyyy",    //Wed Mar  6 13:36:27 1996
45              "E, dd-MMM-yy HH:mm:ss yyyy",    //Friday, 21-Feb-97 15:29:07 1997
46              ", dd MMM yyyy HH:mm:ss z",    //, 17 Feb 1997 14:15:0 GMT
47              "MMM dd, yyyy",    //January 23, 1997
48              "E, dd-MMM-yy z", //Friday, 07-Feb-97 ? GMT
49              "E, dd MMM yy z", //Mon, 10 Feb 1997 ? GMT
50              //OK Fri, 13 Jan 1997 22:13:59 GMT
51              //OK Mon,10 Feb 97 14:00:41 +0000
52              "E, dd-MMM-yy HH z", //Sunday, 12-Jan-97 00 GMT
53  //            DATE PROBLEM!!! :Sunday, 12-Jan-97 00 GMT
54              //Tuesday, 21-Jan-97 00 GMT
55              "E MMM dd HH:mm:ss z yy",//Tue May 08 13:11:00 CEST 2001
56              "E, dd MMM yyyy HH:mm:ss Z (z)" //Wed, 24 Jan 2001 05:59:00 -0800 (PST)
57  
58      };
59      private static String xml11pattern = "[^"
60              + "\u0001-\uD7FF"
61              + "\uE000-\uFFFD"
62              + "\ud800\udc00-\udbff\udfff"
63              + "]+";
64      private static String xml10pattern = "[^"
65              + "\u0009\r\n"
66              + "\u0020-\uD7FF"
67              + "\uE000-\uFFFD"
68              + "\ud800\udc00-\udbff\udfff"
69              + "]";
70  
71      public static void main(String[] args) {
72          final CommandLine cmd = CommandLine
73                  .parser()
74                  .withName("convert-enron")
75                  .withHeader("Convert Enron Email Dataset files to NAF")
76                  .withOption("i", "input-path", "the base path of the corpus", "DIR",
77                          CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
78                  .withOption("o", "output-path", "output NAF folder", "DIR",
79                          CommandLine.Type.DIRECTORY, true, false, true)
80                  .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
81  
82          final File inputPath = cmd.getOptionValue("i", File.class);
83          final File outputPath = cmd.getOptionValue("o", File.class);
84  
85          boolean opMkDirs = outputPath.mkdirs();
86          if (!opMkDirs) {
87              LOGGER.error("Unable to create folder {}", outputPath.getAbsolutePath());
88          }
89  
90  
91          for (final File file : Files.fileTreeTraverser().preOrderTraversal(inputPath)) {
92              if (!file.isFile()) {
93                  continue;
94              }
95              if (file.getName().startsWith(".")) {
96                  continue;
97              }
98  
99  
100 
101                 String relativeFilePath = file.getAbsolutePath().substring(inputPath.getAbsolutePath().length());
102                 if (relativeFilePath.startsWith(File.separator)) {
103                     relativeFilePath = relativeFilePath.substring(1);
104                 }
105 
106                 try {
107                     System.out.println();
108                     System.out.print("Starting processing of file: "+file.getAbsolutePath());
109 
110                     KAFDocument document = new KAFDocument("en", "FBK");
111 
112 //IO.read(nafFile.getAbsolutePath())
113 
114 
115 //                    ContentHandler contentHandler = new CustomContentHandler();
116 //
117 //                    MimeConfig mime4jParserConfig = new MimeConfig();
118 //                    BodyDescriptorBuilder bodyDescriptorBuilder = new DefaultBodyDescriptorBuilder();
119 //                    MimeStreamParser mime4jParser = new MimeStreamParser(mime4jParserConfig, DecodeMonitor.SILENT,bodyDescriptorBuilder);
120 //                    mime4jParser.setContentDecoding(true);
121 //                    mime4jParser.setContentHandler(contentHandler);
122 //                    mime4jParser.parse(IO.read(file.getAbsolutePath()));
123 //
124 //                    Email email = ((CustomContentHandler) contentHandler).getEmail();
125 //
126 //                    Attachment plainText = email.getPlainTextEmailBody();
127 //                    //String to = email.getToEmailHeaderValue();
128 //                    //String cc = email.getCCEmailHeaderValue();
129 //                    String from = email.getFromEmailHeaderValue();
130 //                    String date = email.getHeader().getField("Date").getBody();
131 //                    String messageID = email.getHeader().getField("Message-ID").getBody();
132 
133 
134 
135 
136                     Session s = Session.getDefaultInstance(new Properties());
137                     String raw = readFile(file.getAbsolutePath());
138                     InputStream is = new ByteArrayInputStream(raw.getBytes());
139 
140 
141 //                            IO.read(file.getAbsolutePath());
142                     MimeMessage message = new MimeMessage(s, is);
143 
144                     message.getAllHeaderLines();
145                     String from = "";
146                     for (Enumeration<Header> e = message.getAllHeaders(); e.hasMoreElements();) {
147                         Header h = e.nextElement();
148                         if (h.getName().contains("X-From")) {
149                             //System.out.println(h.getValue());
150                             from = h.getValue().replaceAll(xml10pattern,"");;
151                         }
152                     }
153 
154 
155                     MimeMessageParser parser = new MimeMessageParser(message);
156 
157 
158 //                    String subject = parser.getSubject();
159                     String message_id = parser.getMimeMessage().getMessageID();
160                     String date = parser.getMimeMessage().getSentDate().toString();
161 
162 
163                     //System.out.println(parser.getMimeMessage().getContentType());
164 
165 //                    Object temp = parser.getMimeMessage().
166 //
167 
168                     //String content = new String(bytes, StandardCharsets.US_ASCII);
169                     String content = (String) parser.getMimeMessage().getContent();
170                     //byte[] temp = content.getBytes(StandardCharsets.UTF_8);
171                     //content = new String(temp);
172 
173 
174                     content = content.replaceAll(xml10pattern,"");
175 
176 
177 
178 
179 //                    int n = plainText.getIs().available();
180 //                    byte[] bytes = new byte[n];
181 //                    plainText.getIs().read(bytes, 0, n);
182 //                    String s = new String(bytes, StandardCharsets.US_ASCII);
183 
184 
185                     Date thisDate = null;
186                     try {
187                         thisDate = DateUtils.parseDate(date, parsePatterns);
188 
189                     } catch (ParseException e) {
190                         System.out.println("DATE PROBLEM!!!");
191                     }
192 
193                     document.setRawText(content);
194 
195                     KAFDocument.Public aPublic = document.createPublic();
196                     aPublic.uri = "http://pikes.fbk.eu/enron/" + relativeFilePath;
197                     aPublic.publicId = relativeFilePath;
198                     KAFDocument.FileDesc fileDesc = document.createFileDesc();
199 
200                     fileDesc.title = message_id;
201                     fileDesc.creationtime = sdf.format(thisDate);
202                     fileDesc.author = from;
203 
204                     File outputFile = new File(outputPath + File.separator + relativeFilePath + ".naf");
205                     outputFile.getParentFile().mkdirs();
206                     document.save(outputFile);
207                     System.out.print("  DONE");
208                 } catch (Exception e) {
209                     e.printStackTrace();
210                 }
211 
212 
213         }
214     }
215 
216     public static String readFile(final String file) throws IOException {
217         BufferedReader reader = new BufferedReader(new FileReader(file));
218         String line = null;
219         StringBuilder string_builder = new StringBuilder();
220         String ls = System.getProperty("line.separator");
221 
222         try {
223             while ((line = reader.readLine()) != null) {
224                 string_builder.append(line);
225                 string_builder.append(ls);
226             }
227 
228             return string_builder.toString();
229         } finally {
230             reader.close();
231         }
232     }
233 }