Alberto Fanjul commented:
The guys from Mono finally recovered the txt.gz archives!
https://github.com/mono/mail-archives/issues/1
Using:
-
mailmanToMBox.py
:
#!/usr/bin/env python3
"""
mailmanToMBox.py: Inserts line feeds to create mbox format from Mailman Gzip'd
Text archives decompressed
Usage: ./to-mbox.py dir
Where dir is a directory containing .txt files pulled from mailman Gzip'd Text and decompressed
"""
import sys
import os
import tokenize
def main():
    """Convert every ``.txt`` archive found under a directory to mbox.

    The directory is taken from ``sys.argv[1]``.  With any other number of
    arguments the module docstring is printed as usage and the process
    exits.  The tree is walked recursively; each ``*.txt`` file gets a
    sibling ``*.mbox`` file written by ``makeMBox``.  Files that
    ``makeMBox`` declined to convert are reported, and the number of
    converted archives is printed at the end.
    """
    if len(sys.argv) != 2:
        print(__doc__)
        sys.exit()
    rootDir = sys.argv[1]
    numConv = 0
    for root, _dirs, files in os.walk(rootDir):
        for fil in files:
            # Match on the real extension so that names merely containing
            # ".txt" (e.g. a leftover "2000-May.txt.gz") are not converted.
            base, ext = os.path.splitext(fil)
            if ext != '.txt':
                continue
            # Join with the directory currently being walked, not the top
            # level one, so files in subdirectories resolve correctly.
            inFile = os.path.join(root, fil)
            # Swap only the extension; a ".txt" elsewhere in the path must
            # stay untouched.
            outFile = os.path.join(root, base + '.mbox')
            print('Converting ', fil, ' to mbox format')
            if makeMBox(inFile, outFile):
                numConv += 1
            else:
                print(outFile, ' already exists, did not overwrite')
    print('Converted ', str(numConv), 'archives to mbox format')
def makeMBox(fIn,fOut):
    '''
    Convert one decompressed Mailman text archive into an mbox file.

    Adapted from
    http://lists2.ssc.com/pipermail/linux-list/2006-February/026220.html

    Every line starting with "From " is treated as a message separator: a
    blank line is written before each separator except the first, and the
    " at " inside the separator line is turned back into "@".  All other
    lines are copied unchanged.

    fIn  -- path of the archive text file to read.
    fOut -- path of the mbox file to create; written as UTF-8.

    Returns True when fOut was written.  Returns False, without touching
    anything, when fIn does not exist or fOut already exists.
    '''
    if not os.path.exists(fIn) or os.path.exists(fOut):
        return False

    # detect encoding (BOM or coding cookie); fall back to UTF-8 when the
    # first lines cannot be interpreted.
    try:
        with open(fIn, 'rb') as raw:
            fInCodec = tokenize.detect_encoding(raw.readline)[0]
    except (SyntaxError, UnicodeError):
        fInCodec = "utf-8"

    seenMessage = False
    # Undecodable input bytes are replaced rather than aborting the run.
    # The output encoding is fixed so the result does not depend on the
    # locale and every decoded character can be written.
    with open(fIn, 'rt', encoding=fInCodec, errors="replace") as src, \
            open(fOut, 'w', encoding="utf-8") as out:
        for line in src:
            if line.startswith("From "):
                # mbox wants a blank line between consecutive messages.
                if seenMessage:
                    out.write("\n")
                seenMessage = True
                line = line.replace(" at ", "@")
            out.write(line)
    return True
# INIT
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
and
-
mail-archive-to-mbox
:
#!/bin/bash
# Download a Mailman list archive, convert each downloaded file to mbox with
# ./mailmanToMBox.py and concatenate the results into <MAIL-LIST-NAME>.mbox
# in the current directory.

# Two arguments are required. "-lt" is the numeric comparison; a bare "<"
# inside [ ] is an input redirection, not a test.
if [ "$#" -lt 2 ]
then
    echo "run as $0 <HOST> <MAIL-LIST-NAME>" >&2
    exit 1
fi
HOST="$1"
MAIL_LIST_NAME="$2"
# Download and unzip the list archive
wget -r -l1 --no-parent --no-directories "$HOST/$MAIL_LIST_NAME/" -P "./$MAIL_LIST_NAME" -A "*-*.txt.gz"
gzip -d "$MAIL_LIST_NAME"/*.txt.gz
# convert to mbox
./mailmanToMBox.py "$MAIL_LIST_NAME"
# Concatenate mbox
# NOTE: the glob expands in lexical order of the file names, which is not
# chronological for month-named archives (e.g. 2000-April sorts before
# 2000-January).
cat "$MAIL_LIST_NAME"/*.mbox > "$MAIL_LIST_NAME.mbox"
I was able to create the mbox.
./mail-archive-to-mbox https://mono.github.io/mail-archives glade-users
./mail-archive-to-mbox https://mono.github.io/mail-archives glade-devel
@averi I'm not sure whether the mbox needs to be sorted to be imported correctly.
NOTE: I need to fix some stray characters in 2000-September.txt (in the email name on the first line)
glade-users.mbox glade-devel.mbox
Once that is imported, I will fix glade-web#1 too.