from lxml import etree
import os
from pathvalidate import sanitize_filename
import requests
from urllib.parse import urlparse, parse_qs
from copy import deepcopy
import glob
import html
import json
import random

# Post-processes archived weibo.cn HTML pages in place: strips page chrome,
# downloads original pictures, rewrites weibo.cn-relative links, resolves
# t.cn short links, and regenerates index.html. Re-running is idempotent:
# once a link has been rewritten it no longer matches the patterns below.

with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json")) as config_f:
    config = json.load(config_f)
# The config stores the full "SUB=..." cookie string; keep only the value.
cookie_sub = config["request"]["cookie"].split("=", 1)[1]
ua = config["request"]["ua"]
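# Assumed config.json shape, inferred from the lookups above (placeholder
# values, not real ones):
# {
#     "request": {
#         "cookie": "SUB=<weibo.cn SUB cookie value>",
#         "ua": "<browser User-Agent string>"
#     }
# }

# One shared header set for the weibo.cn and sinaimg.cn requests below.
REQUEST_HEADERS = {
    "Accept": "*/*",
    "User-Agent": ua,
    "Accept-Encoding": "gzip, deflate, br",
}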

html_parser = etree.HTMLParser(encoding="utf-8")
index = []
oripic_set = set()

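# Interchangeable sinaimg.cn CDN mirrors; any of them should serve a given
# pic id (an assumption based on observed behavior), so picking one at
# random spreads the download load.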
SINA_IMG_CDNS = [
    "https://wx1.sinaimg.cn/",
    "https://wx2.sinaimg.cn/",
    "https://wx3.sinaimg.cn/",
    "https://wx4.sinaimg.cn/",
]

def download_oripic(oripic_addr):
    """Download one original-resolution picture; return its local path."""
    # /mblog/oripic?u=<pic id>: the 'u' query parameter names the picture.
    oripic_id = parse_qs(urlparse(oripic_addr).query)['u'][0]
    oripic_local_path = "./oripic/" + oripic_id + '.jpg'

    if oripic_id in oripic_set:
        print("Already downloaded, skip: " + oripic_id)
        return oripic_local_path
    print("Downloading original pic: " + oripic_addr)

    # Fetch straight from a random sinaimg CDN mirror rather than bouncing
    # through weibo.cn; comment out this line to use the original address.
    oripic_addr = random.choice(SINA_IMG_CDNS) + "large/" + oripic_id + ".jpg"

    oripic_r = requests.get(
        oripic_addr,
        headers=REQUEST_HEADERS,
        cookies={"SUB": cookie_sub},
        proxies={},
        verify=True,
        stream=True)
    # stream=True above, so write the image out in chunks instead of
    # buffering the whole response in memory.
    with open(oripic_local_path, "wb") as oripic_f:
        for chunk in oripic_r.iter_content(chunk_size=65536):
            oripic_f.write(chunk)
    oripic_set.add(oripic_id)
    return oripic_local_path


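# One-time setup: the picture directory must exist, and any stale index.html
# from a previous run is removed so it is regenerated below (this also keeps
# it out of the *.html glob).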
os.makedirs("./oripic", exist_ok=True)
try:
    os.unlink("./index.html")
except FileNotFoundError:
    pass
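# Reverse-sorted filenames put the newest pages first (assuming the archive
# filenames sort chronologically), so the index reads newest-to-oldest.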
for curr_html_file_path in sorted(glob.glob("./*.html"), reverse=True):
    curr_html_file_name = os.path.basename(curr_html_file_path)
    print(f"Processing {curr_html_file_name} ", end='')
    with open(curr_html_file_name, "rb") as input_f:
        raw = input_f.read()
    tree = etree.fromstring(raw, html_parser)
    # The recovering HTML parser can come back with None for empty or
    # badly damaged pages.
    if tree is None:
        print("<内容缺失>")
        if curr_html_file_name.endswith("_repost.html"):
            index.append(("<内容缺失>", "./" + curr_html_file_name))
        continue
    # The post body lives in a div whose id contains "M_"; its first 60
    # characters become the index excerpt.
    excerpt = tree.xpath("string(//div[contains(@id, 'M_')])").strip()[0:60]
    if excerpt == "":
        excerpt = "<内容缺失>"
    print(excerpt)
    if curr_html_file_name.endswith("_repost.html"):
        index.append((excerpt, "./" + curr_html_file_name))

    for n in tree.xpath('//body/div[@class="n"]'):
        n.getparent().remove(n)
    for n in tree.xpath('//body/div[@class="c tip"]'):
        n.getparent().remove(n)
    for n in tree.xpath('//body/div[@class="s"]'):
        n.getparent().remove(n)

    has_pic_all = False
    for n in tree.xpath('//a'):
        href = n.get('href')
        if href:
            if href.startswith('/mblog/picAll'):
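                # A /mblog/picAll link leads to a separate page listing every
                # picture of a multi-image post. Fetch that page, strip its
                # chrome, localize its original pictures, and inline the
                # result into the current page.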
                print("Downloading picAll page: " + href)
                pic_all_page = requests.get(f"https://weibo.cn{href}",
                    headers=REQUEST_HEADERS,
                    cookies={"SUB": cookie_sub},
                    proxies={},
                    verify=True,
                ).content
                pic_all_tree = etree.fromstring(pic_all_page, html_parser)
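                # Drop the picAll page's own chrome: the divs wrapping links
                # back to "/" or "/?...", and the per-picture thumbnail links
                # (/mblog/pic), keeping only the /mblog/oripic anchors.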
                for nn in pic_all_tree.xpath('//body/div/a[@href="/"]'):
                    nn.getparent().getparent().remove(nn.getparent())
                for nn in pic_all_tree.xpath('//body/div/a[starts-with(@href, "/?")]'):
                    nn.getparent().getparent().remove(nn.getparent())
                for nn in pic_all_tree.xpath('//body/div/a[starts-with(@href, "/mblog/pic")]'):
                    nn.getparent().remove(nn)
                for nn in pic_all_tree.xpath('//body/div/a[starts-with(@href, "/mblog/oripic")]'):
                    oripic_addr = nn.get("href")
                    if not oripic_addr:
                        continue
                    oripic_addr = "https://weibo.cn" + oripic_addr
                    nn.text = ''
                    oripic_ele = etree.Element("img")
                    oripic_ele.set("style", "max-width: 150px;")
                    oripic_local_path = download_oripic(oripic_addr)
                    oripic_ele.set("src", oripic_local_path)
                    nn.insert(0, oripic_ele)
                    nn.set("href", oripic_local_path)
                pic_all_cloned = deepcopy(pic_all_tree.xpath('//body')[0])
                pic_all_cloned.tag = "div"
                n.getparent().getparent().insert(1, pic_all_cloned)
                has_pic_all = True
                n.set('href', "https://weibo.cn" + href)
            elif href.startswith('/mblog/oripic'):
                if has_pic_all:
                    # The inlined picAll block already covers this picture;
                    # drop the duplicate link.
                    n.getparent().remove(n)
                else:
                    oripic_addr = "https://weibo.cn" + href
                    n.text = ''
                    oripic_ele = etree.Element("img")
                    oripic_ele.set("style", "max-width: 150px;")
                    oripic_local_path = download_oripic(oripic_addr)
                    oripic_ele.set("src", oripic_local_path)
                    n.insert(0, oripic_ele)
                    n.set('href', oripic_local_path)
            elif href.startswith('/mblog/pic'):
                n.getparent().remove(n)
            # Point repost/comment/attitude links at their archived pages.
            # Assumes the archive naming scheme "<a>_<b>_<c>_<kind>.html",
            # where the first three "_"-separated fields identify the post.
            elif href.startswith('/repost'):
                n.set("href", "./" + ("_".join(curr_html_file_name.split("_")[0:3])) + "_repost.html")
            elif href.startswith('/comment'):
                n.set("href", "./" + ("_".join(curr_html_file_name.split("_")[0:3])) + "_comment.html")
            elif href.startswith('/attitude'):
                n.set("href", "./" + ("_".join(curr_html_file_name.split("_")[0:3])) + "_attitude.html")
            elif n.text is not None and (n.text.strip().startswith("http://t.cn") or n.text.strip().startswith("https://t.cn")):
                out_url = n.text.strip()
                print("Translating t.cn url: " + out_url + " -> ", end='', flush=True)
                out_url_r = requests.get(out_url, allow_redirects=False)
                try:
                    true_url = out_url_r.headers["Location"].strip()
                    print(true_url)
                    n.text = true_url
                    n.set("href", true_url)
                except Exception as e:
                    print(repr(e))
            elif href.startswith("https://weibo.cn/sinaurl") or href.startswith("http://weibo.cn/sinaurl"):
                out_sina_url = href.strip()
                try:
                    # The real target rides in the 'u' query parameter; a
                    # missing parameter is reported instead of crashing.
                    out_url = parse_qs(urlparse(out_sina_url).query)['u'][0]
                    print("Translating (sinaurl) t.cn url: " + out_url + " -> ", end='', flush=True)
                    if out_url.startswith("http://t.cn") or out_url.startswith("https://t.cn"):
                        out_url_r = requests.get(out_url, allow_redirects=False)
                        true_url = out_url_r.headers["Location"].strip()
                    else:
                        true_url = out_url
                    print(true_url)
                    n.text = true_url
                    n.set("href", true_url)
                except Exception as e:
                    print(repr(e))
            elif href.startswith('/'):
                # Any remaining relative link points back into weibo.cn.
                n.set('href', "https://weibo.cn" + href)
    for n in tree.xpath('//a'):
        href = n.get('href')
        if href and href.startswith("./oripic/"):
            img_id_ext = href[len("./oripic/"):]
            if os.path.exists("../pre-oct-2020/oripic/" + img_id_ext):
                print(f"Merging <a> {img_id_ext}")
                try:
                    os.unlink(href)
                except FileNotFoundError:
                    pass
                n.set("href", "../pre-oct-2020/oripic/" + img_id_ext)
    for n in tree.xpath('//img'):
        img_src = n.get('src')
        if img_src and img_src.startswith("./oripic/"):
            img_id_ext = img_src[len("./oripic/"):]
            if os.path.exists("../pre-oct-2020/oripic/" + img_id_ext):
                print(f"Merging <img> {img_id_ext}")
                try:
                    os.unlink(img_src)
                except FileNotFoundError:
                    pass
                n.set("src", "../pre-oct-2020/oripic/" + img_id_ext)
    with open(curr_html_file_name, "w", encoding="utf-8") as output_f:
        output_f.write(etree.tostring(tree, pretty_print=True, encoding="unicode", method="html"))

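# Finally, emit a minimal index.html: one table row per archived post,
# linking its repost page plus the matching comment and attitude pages.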
with open("index.html", "w", encoding="utf-8") as index_f:
    index_f.write("""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Weibo index</title>
</head>
<body>
    <h2>Weibo Index</h2>
    <table>
""")
    for name, path in index:
        # Every index entry is a *_repost.html page; derive the matching
        # comment and attitude pages from the same stem.
        base = path[:-len('_repost.html')]
        index_f.write(
            "        <tr>"
            f"<td><a href=\"{html.escape(path)}\">{html.escape(name)}</a></td>"
            f"<td><a href=\"{html.escape(base + '_comment.html')}\">Comments</a></td>"
            f"<td><a href=\"{html.escape(base + '_attitude.html')}\">Attitudes</a></td>"
            "</tr>\n")
    index_f.write("""    </table>
</body>
</html>
""")
