Dredging the past and updating the present

A few months ago, while digging around my garage, I found an old server of mine. I was looking for a particular bit of old data that I couldn't find on my current computer and thought it might be on that machine. It didn't have what I was looking for, but it did have a bunch of content from my old blog, going back to 2004.

It was a really old computer that hadn't been cared for since I moved everything to the cloud, and the hard drives started failing reads almost as soon as I looked at the thing. However, with some careful runs of dd and a lot of finger-crossing, I was able to pull a backup of the old blog off the disks. The backup, annoyingly, was a MySQL dump of a WordPress site. After trying a handful of text-based approaches and a couple of automated converters, I figured I might as well install MySQL, load the dump into the database, and hack together some one-off code to rip things out in a format that I liked.

The data extraction code isn't pretty but it gets the job done:

gen_olddata.py

#!/usr/bin/env python3
"""Pull old posts / comments from mysql."""

import collections
import datetime
import html
import os

import pypandoc
import sqlalchemy as sa

TARGET = os.path.join(
    os.path.dirname(__file__),
    '..',
    'posts',
)
os.makedirs(TARGET, exist_ok=True)

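# Connect to a local MySQL copy of the old blog and reflect the WordPress
# tables (this is the old SQLAlchemy 1.x reflection API).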
ENGINE = sa.create_engine('mysql+mysqlconnector://root@localhost/oldblog')
META = sa.MetaData(ENGINE)
POSTS = sa.Table('gw_posts', META, autoload=True)
COMMENTS = sa.Table('gw_comments', META, autoload=True)
TERMS = sa.Table('gw_terms', META, autoload=True)
TERM_TAXO = sa.Table('gw_term_taxonomy', META, autoload=True)
TERM_RELS = sa.Table('gw_term_relationships', META, autoload=True)

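# Comment IDs to drop outright, regardless of approval status.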
COMMENT_BLACKLIST = {
    887,
    20,
    21,
    844,
    61,
    62,
    439,
    951,
    952,
    978,
    979,
    993,
}

def text_fix(text):
    """Strip carriage returns and repair the mojibake the dump picked up
    (UTF-8 text that was decoded as cp1252 somewhere along the way)."""
    for search, replace in [
        ('\r', ''),
        ('Ã¼', 'ü'),
        ('Ã®', 'î'),
        ('â€™', '’'),
        ('â€œ', '“'),
        ('â€¦', '…'),
        # The bare â€ must come after the longer sequences that start with it.
        ('â€', '”'),
        ('Ã¥', 'å'),
        ('Ã†', 'Æ'),
        ('Ã¦', 'æ'),
        ('Ã¸', 'ø'),
        ('Ã˜', 'Ø'),
    ]:
        text = text.replace(search, replace)
    return text

def clean_content(content):
    content = text_fix(content)
    # WordPress stores paragraphs as bare text separated by blank lines;
    # wrap them in <p> tags so pandoc sees real HTML.
    if '<p>' not in content:
        content = '<p>' + content.replace('\n\n', '</p><p>') + '</p>'
    content = pypandoc.convert_text(content, 'rst', format='html')
    # Collapse the runs of blank lines the conversion leaves behind.
    while '\n\n\n' in content:
        content = content.replace('\n\n\n', '\n\n')
    content = content.strip()
    return content

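# Walk every published post, convert it to reStructuredText, and write it out.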
for post in ENGINE.execute(sa.select([POSTS])):
    if post['post_status'] != 'publish':
        print('!skipping unpublished:', post['post_name'])
        continue
    print('processing:', post['post_date_gmt'].year, post['post_name'])
    slug = post['post_name']
    date = post['post_date_gmt'].replace(tzinfo=datetime.timezone.utc)
    updated = post['post_modified_gmt'].replace(tzinfo=datetime.timezone.utc)
    title = html.unescape(post['post_title'])
    content = clean_content(post['post_content'])

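    # Look up the tags attached to this post via the WordPress term tables.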
    tag_query = sa.select([
        TERMS.c.name
    ]).select_from(
        TERMS.join(
            TERM_TAXO, TERMS.c.term_id == TERM_TAXO.c.term_id
        ).join(
            TERM_RELS, TERM_TAXO.c.term_taxonomy_id == TERM_RELS.c.term_taxonomy_id
        )
    ).where(TERM_RELS.c.object_id == post['ID'])
    tag_list = [html.unescape(r['name']) for r in ENGINE.execute(tag_query)]

    post_meta = collections.OrderedDict()
    post_meta['slug'] = slug
    post_meta['date'] = str(date)
    if updated != date:
        post_meta['updated'] = str(updated)
    post_meta['tags'] = ', '.join(t.strip() for t in tag_list)

    folder = os.path.join(TARGET, f'{date.year:04d}', f'{date.month:02d}')
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, f'{slug}.rst')
    with open(path, 'wt', encoding='utf-8') as outfile:
        outfile.write(title + '\n')
        outfile.write('=' * len(title) + '\n\n')
        for key, value in post_meta.items():
            outfile.write(f':{key}: {value}\n')
        outfile.write('\n')
        outfile.write(content + '\n')

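    # Pull this post's comments and write each one out next to the post file.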
    comment_query = sa.select([
        COMMENTS
    ]).where(COMMENTS.c.comment_post_ID == post['ID'])
    for comment in ENGINE.execute(comment_query):
        if comment['comment_approved'] != '1' or comment['comment_ID'] in COMMENT_BLACKLIST:
            print(
                ' -! skipping unapproved/blacklisted:',
                comment['comment_ID'],
                comment['comment_author'])
            continue
        print(' - processing comment', comment['comment_ID'])
        cid = comment['comment_ID']
        cauthor = text_fix(comment['comment_author'])
        cauthor_email = comment['comment_author_email']
        cauthor_url = comment['comment_author_url']
        cdate = comment['comment_date_gmt']
        cparent = comment['comment_parent']
        ccontent = clean_content(comment['comment_content'])

        comment_meta = collections.OrderedDict()
        comment_meta['id'] = str(cid)
        if cparent != 0:
            comment_meta['parent_id'] = str(cparent)
        comment_meta['author'] = cauthor
        if cauthor_email:
            comment_meta['author_email'] = cauthor_email
        if cauthor_url:
            comment_meta['author_url'] = cauthor_url
        comment_meta['date_utc'] = cdate
        comment_meta['compiler'] = 'rest'

        cpath = os.path.join(folder, f'{slug}.{cid}.wpcomment')
        with open(cpath, 'wt', encoding='utf-8') as coutfile:
            for key, value in comment_meta.items():
                coutfile.write(f'.. {key}: {value}\n')
            coutfile.write('\n')
            coutfile.write(ccontent + '\n')
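
As an aside, that text_fix table is just a hand-rolled version of the classic double-encoding repair: the dump contained UTF-8 text that had been decoded as cp1252 somewhere along the way. A more general fix is to round-trip the text through cp1252. This is only a sketch of the usual trick, not something the script above does, and fix_mojibake is just an illustrative name:

def fix_mojibake(text):
    """Undo UTF-8 text that was mistakenly decoded as cp1252."""
    try:
        return text.encode('cp1252').decode('utf-8')
    except UnicodeError:
        # Either the text is already clean or bytes were lost along the way;
        # leave it alone rather than make things worse.
        return text

print(fix_mojibake('brÃ¼tal, isnâ€™t it?'))  # brütal, isn’t it?

It wouldn't have handled everything here anyway: the right-double-quote sequence lost a byte that cp1252 can't represent, which is why the table still needs the bare 'â€' entry.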

From there, I had a lot of cleanup work to do and some images to track down and copy over. Then there was some early-20s angst to remove.

Of course, that got me updating things and finding little problems with the site's display and layout. One thing led to another, and I've ended up sinking a bunch of time into tiny content and style tweaks.