gen_olddata.py (Source)

#!/usr/bin/env python3
"""Pull old posts / comments from mysql."""

import collections
import datetime
import html
import os

import pypandoc
import sqlalchemy as sa

TARGET = os.path.join(
    os.path.dirname(__file__),
    '..',
    'posts',
)
os.makedirs(TARGET, exist_ok=True)

ENGINE = sa.create_engine('mysql+mysqlconnector://root@localhost/oldblog')
META = sa.MetaData(ENGINE)
POSTS = sa.Table('gw_posts', META, autoload=True)
COMMENTS = sa.Table('gw_comments', META, autoload=True)
TERMS = sa.Table('gw_terms', META, autoload=True)
TERM_TAXO = sa.Table('gw_term_taxonomy', META, autoload=True)
TERM_RELS = sa.Table('gw_term_relationships', META, autoload=True)

COMMENT_BLACKLIST = {
    887,
    20,
    21,
    844,
    61,
    62,
    439,
    951,
    952,
    978,
    979,
    993,
}

def text_fix(text):
    for search, replace in [
        ('\r', ''),
        ('ü', 'ü'),
        ('î', 'î'),
        ('’', '’'),
        ('“', '“'),
        ('…', '…'),
        ('â€', '”'),
        ('Ã¥', 'å'),
        ('Æ', 'Æ'),
        ('æ', 'æ'),
        ('ø', 'ø'),
        ('Ø', 'Ø'),
    ]:
        text = text.replace(search, replace)
    return text

def clean_content(content):
    content = text_fix(content)
    if '<p>' not in content:
        content = '<p>' + content.replace('\n\n', '</p><p>') + '</p>'
    content = pypandoc.convert_text(content, 'rst', format='html')
    while '\n\n\n' in content:
        content = content.replace('\n\n\n', '\n\n')
    content = content.strip()
    return content

for post in ENGINE.execute(sa.select([POSTS])):
    if post['post_status'] != 'publish':
        print('!skipping unpublished:', post['post_name'])
        continue
    print('processing:', post['post_date_gmt'].year, post['post_name'])
    slug = post['post_name']
    date = post['post_date_gmt'].replace(tzinfo=datetime.timezone.utc)
    updated = post['post_modified_gmt'].replace(tzinfo=datetime.timezone.utc)
    title = html.unescape(post['post_title'])
    content = clean_content(post['post_content'])

    tag_query = sa.select([
        TERMS.c.name
    ]).select_from(
        TERMS.join(
            TERM_TAXO, TERMS.c.term_id == TERM_TAXO.c.term_id
        ).join(
            TERM_RELS, TERM_TAXO.c.term_taxonomy_id == TERM_RELS.c.term_taxonomy_id
        )
    ).where(TERM_RELS.c.object_id == post['ID'])
    tag_list = [html.unescape(r['name']) for r in ENGINE.execute(tag_query)]

    post_meta = collections.OrderedDict()
    post_meta['slug'] = slug
    post_meta['date'] = str(date)
    if updated != date:
        post_meta['updated'] = str(updated)
    post_meta['tags'] = ', '.join(t.strip() for t in tag_list)

    folder = os.path.join(TARGET, f'{date.year:04d}', f'{date.month:02d}')
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, f'{slug}.rst')
    with open(path, 'wt') as outfile:
        outfile.write(title + '\n')
        outfile.write('=' * len(title) + '\n\n')
        for key, value in post_meta.items():
            outfile.write(f':{key}: {value}\n')
        outfile.write('\n')
        outfile.write(content + '\n')

    comment_query = sa.select([
        COMMENTS
    ]).where(COMMENTS.c.comment_post_ID == post['ID'])
    for comment in ENGINE.execute(comment_query):
        if comment['comment_approved'] != '1' or comment['comment_ID'] in COMMENT_BLACKLIST:
            print(
                ' -! skipping unapproved:',
                comment['comment_ID'],
                comment['comment_author'])
            continue
        print(' - processing comment', comment['comment_ID'])
        cid = comment['comment_ID']
        cauthor = text_fix(comment['comment_author'])
        cauthor_email = comment['comment_author_email']
        cauthor_url = comment['comment_author_url']
        cdate = comment['comment_date_gmt']
        cparent = comment['comment_parent']
        ccontent = clean_content(comment['comment_content'])

        comment_meta = collections.OrderedDict()
        comment_meta['id'] = str(cid)
        if cparent != 0:
            comment_meta['parent_id'] = str(cparent)
        comment_meta['author'] = cauthor
        if cauthor_email:
            comment_meta['author_email'] = cauthor_email
        if cauthor_url:
            comment_meta['author_url'] = cauthor_url
        comment_meta['date_utc'] = cdate
        comment_meta['compiler'] = 'rest'

        cpath = os.path.join(folder, f'{slug}.{cid}.wpcomment')
        with open(cpath, 'wt') as coutfile:
            for key, value in comment_meta.items():
                coutfile.write(f'.. {key}: {value}\n')
            coutfile.write('\n')
            coutfile.write(ccontent + '\n')