2023-10-30 10:44:39 -05:00

88 lines
2.8 KiB
Python

#!/usr/bin/env python
"""
2020 update:
- More iterators, fewer lists
- Python 3 compatible
- Processes files in parallel
(one worker process per CPU by default; multiprocessing.Pool uses processes, not threads)
"""
import glob
import os
import email
from email import policy
from multiprocessing import Pool
import sys
import re

# Merge conflict resolved in favour of the HEAD side: extract() below uses
# both `re` and OUTREGEX, which only exist on that side.

# Extension (without the dot) of the message files picked up from the
# current directory.
EXTENSION = "eml"
# Greedy match of everything up to and including the last backslash of a
# string, i.e. the directory part of a Windows-style path.
OUTREGEX = ".*\\\\"
# Matches a string that ends with a backslash.
# NOTE(review): not referenced anywhere in the visible code - confirm it is
# still needed.
INREGEX = ".*\\\\$"
def extract(filename):
    """
    Extract the attachments of one message file next to that file.

    The message is parsed with ``policy.default`` and handed to
    ``extractBody``; afterwards every attachment that carries a filename is
    written into the directory containing ``filename`` (the current
    directory when ``filename`` has no directory component).

    Args:
        filename: path of the message file to process.

    Returns:
        A ``(1, output_count)`` tuple: one processed file and the number of
        attachments written for it, so that per-file results can be summed
        column-wise by the caller.
    """
    # Attachments are written next to the input file. os.path.dirname also
    # copes with a bare file name; indexing the result of a regex search for
    # a trailing backslash raised IndexError for such names.
    od = os.path.dirname(filename) or os.curdir
    output_count = 0
    try:
        # exist_ok avoids a check-then-create race between parallel workers
        os.makedirs(od, exist_ok=True)
        # Parse from bytes: a text-mode read can fail with UnicodeDecodeError
        # on messages that are not valid in the locale's encoding.
        with open(filename, "rb") as f:
            msg = email.message_from_binary_file(f, policy=policy.default)
        extractBody(msg)
        for attachment in msg.iter_attachments():
            try:
                output_filename = attachment.get_filename()
            except AttributeError:
                print("Got string instead of filename for %s. Skipping." % filename)
                continue
            # Attachments without a filename are skipped
            if not output_filename:
                continue
            # The name comes from the message itself; keep only its last
            # path component so it cannot point outside of od.
            safe_name = os.path.basename(output_filename)
            if not safe_name:
                continue
            # Fetch the payload before creating the file so that a missing
            # payload does not leave an empty file behind. With decode=True
            # the payload is None for multipart (container) parts.
            payload = attachment.get_payload(decode=True)
            if payload is None:
                print("Couldn't get payload for %s" % output_filename)
                continue
            with open(os.path.join(od, safe_name), "wb") as of:
                of.write(payload)
            output_count += 1
        if output_count == 0:
            print("No attachment found for file %s!" % filename)
    # this should catch read and write errors; report via `filename`, since
    # the file object does not exist when open() itself failed
    except IOError:
        print("Problem with %s or one of its attachments!" % filename)
    return 1, output_count
def extractBody(msg):
    """
    Save the named, non-multipart parts of ``msg`` that carry a
    Content-Disposition header into the ``output`` directory.

    Defined ahead of the ``__main__`` section so that the function already
    exists when the worker pool is created; worker processes started by
    forking only see what was defined up to that point.

    Args:
        msg: a parsed message object providing ``walk()``.

    Returns:
        None. Files are written as a side effect.
    """
    for part in msg.walk():
        # Container parts have content types such as "multipart/mixed", so
        # the main type has to be compared; the full content type never
        # equals "multipart".
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        imageName = part.get_filename()
        # A disposition header without a filename gives nothing to save to
        if not imageName:
            continue
        # The name comes from the message itself; keep only its last path
        # component so it cannot point outside of the output directory.
        safeName = os.path.basename(imageName)
        if not safeName:
            continue
        print("Found image in body")
        # Fetch the payload before creating the file so that a missing
        # payload does not leave an empty file behind.
        payload = part.get_payload(decode=True)
        if payload is None:
            print("Couldn't get payload for %s" % imageName)
            continue
        os.makedirs("output", exist_ok=True)
        with open(os.path.join("output", safeName), "wb") as out:
            out.write(payload)


if __name__ == "__main__":
    # let's do this in parallel; Pool(None) picks the number of worker
    # processes itself
    pool = Pool(None)
    res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
    # need these if we use _async
    pool.close()
    pool.join()
    # 2-element list holding number of files, number of attachments;
    # falls back to zeros when no file matched, because zip(*[]) yields
    # nothing and the format call below needs two values
    numfiles = [sum(i) for i in zip(*res)] or [0, 0]
    print("Done: Processed {} files with {} attachments.".format(*numfiles))
    input('Press Enter to close...')