2023-09-07 09:01:49 -05:00

80 lines
2.7 KiB
Python

#!/usr/bin/env python
"""
2020 update:
- More iterators, fewer lists
- Python 3 compatible
- Processes files in parallel
(one worker process per CPU; multiprocessing.Pool uses processes, not threads)
"""
import glob
import os
import email
from email import policy
from multiprocessing import Pool
import sys
# File extension (without the leading dot) of the message files to process.
EXTENSION = "eml"
# NOTE(review): not referenced anywhere in the visible code (the output
# directory is hard-coded as "output") -- confirm whether this is still needed.
OUTPATH = ""
def extract(filename):
    """
    Extract the attachments of one message file into the "output" directory.

    The function is meant to be mapped over many files by a process pool, so
    problems are reported with ``print`` instead of being raised.

    Args:
        filename: path of the e-mail (.eml) file to parse.

    Returns:
        A tuple ``(1, output_count)``: one processed file and the number of
        attachments written by the attachment loop below. Files written by
        ``extractBody`` are not included in that count.
    """
    print(filename)
    # Ensure that an output dir exists. exist_ok avoids a FileExistsError
    # when several worker processes try to create it at the same moment.
    od = "output"
    os.makedirs(od, exist_ok=True)
    output_count = 0
    try:
        # Parse from bytes: a message file is not guaranteed to be decodable
        # with the locale's text encoding; the email parser deals with the
        # charsets declared inside the message itself.
        with open(filename, "rb") as f:
            msg = email.message_from_binary_file(f, policy=policy.default)
        extractBody(msg)
        for attachment in msg.iter_attachments():
            try:
                output_filename = attachment.get_filename()
            except AttributeError:
                print("Got string instead of filename for %s. Skipping." % filename)
                continue
            # An attachment without a filename cannot be saved; skip it.
            if not output_filename:
                continue
            # The name comes from the (untrusted) message: keep only its last
            # path component so it cannot escape the output directory.
            safe_name = os.path.basename(output_filename.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                continue
            # Fetch the payload before opening the target so that a missing
            # payload (e.g. a multipart attachment) leaves no empty file.
            payload = attachment.get_payload(decode=True)
            if payload is None:
                print("Couldn't get payload for %s" % output_filename)
                continue
            with open(os.path.join(od, safe_name), "wb") as of:
                of.write(payload)
            output_count += 1
        if output_count == 0:
            print("No attachment found for file %s!" % filename)
    # this should catch read and write errors (IOError is an alias of OSError)
    except OSError:
        # Use the filename argument: the file object does not exist when
        # open() itself is what failed.
        print("Problem with %s or one of its attachments!" % filename)
    return 1, output_count
# NOTE: extractBody must be defined before the __main__ section below runs.
# Worker processes that are forked when the Pool is created only see the
# module-level names that exist at that moment.
def extractBody(msg):
    """
    Save every non-multipart part of *msg* that has a Content-Disposition
    header (regular attachments as well as inline images) into "output".

    Args:
        msg: a parsed e-mail message object that supports ``walk()``.

    Returns:
        None. Parts that cannot be saved (no filename, no payload) are
        skipped; a missing payload is reported with ``print``.
    """
    out_dir = "output"
    for part in msg.walk():
        # Multipart containers only group other parts and carry no payload of
        # their own. (get_content_type() returns "maintype/subtype", so it is
        # never equal to the bare string "multipart".)
        if part.get_content_maintype() == "multipart":
            continue
        if part.get("Content-Disposition") is None:
            continue
        print("Found image in body")
        image_name = part.get_filename()
        # A disposition header without a filename gives nothing to save under.
        if not image_name:
            continue
        # The name comes from the (untrusted) message: keep only its last
        # path component so it cannot escape the output directory.
        safe_name = os.path.basename(image_name.replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            continue
        # Fetch the payload first so a missing one leaves no empty file.
        payload = part.get_payload(decode=True)
        if payload is None:
            print("Couldn't get payload for %s" % image_name)
            continue
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, safe_name), "wb") as out:
            out.write(payload)


def main():
    """
    Process every message file matching the optional command-line argument
    in parallel and print a summary.

    The first argument is either a directory or a path prefix; without an
    argument the current directory is searched.
    """
    # No argument means "current directory" (indexing sys.argv[1]
    # unconditionally raises IndexError when it is absent).
    path = sys.argv[1] if len(sys.argv) > 1 else ""
    if os.path.isdir(path):
        # A directory given without a trailing separator still means
        # "files inside it".
        pattern = os.path.join(path, "*.%s" % EXTENSION)
    else:
        pattern = "%s*.%s" % (path, EXTENSION)
    # let's do this in parallel, one worker process per CPU
    pool = Pool(None)
    try:
        res = pool.map(extract, glob.iglob(pattern))
    finally:
        pool.close()
        pool.join()
    # Each result is (files, attachments); summing the columns separately
    # also works when no file matched at all.
    file_count = sum(files for files, _ in res)
    attachment_count = sum(attachments for _, attachments in res)
    print("Done: Processed {} files with {} attachments.".format(
        file_count, attachment_count))
    # Keep the console window open until the user acknowledges.
    input('Press Enter to close...')


if __name__ == "__main__":
    main()