import json
import os
import re
import time
import traceback
from datetime import datetime

import requests
from lxml import etree
from playwright.sync_api import sync_playwright
from secure_db_client import get_remote_engine


def mysql_connect():
    """Build and return the remote database engine used by this script.

    The engine is requested from ``get_remote_engine`` for site ``'us'`` with
    the ``'postgresql_15_outer'`` database type.

    NOTE(review): the function name says "mysql" while the requested
    ``db_type`` is a PostgreSQL one; the original inline comments describe
    this as the server-side alias "mysql" backed by the "selection"
    database -- confirm against ``secure_db_client``.
    """
    return get_remote_engine(
        db_type='postgresql_15_outer',
        site_name='us',
    )

def parse_list(s: str):
    """Turn a bracketed, comma-separated string into a list of tokens.

    ``"[a, b, c]"`` becomes ``["a", "b", "c"]``. Surrounding whitespace and
    any leading/trailing square brackets are removed, each token is
    stripped, and empty tokens are dropped.
    """
    inner = s.strip().strip('[]')
    tokens = []
    for piece in inner.split(','):
        piece = piece.strip()
        if piece:
            tokens.append(piece)
    return tokens

def clean_text(node):
    """Return all text inside ``node`` with whitespace collapsed.

    Every text fragment yielded by ``node.itertext()`` is concatenated and
    runs of whitespace are reduced to single spaces (done in Python rather
    than with XPath ``normalize-space()``).
    """
    raw_text = "".join(node.itertext())
    words = raw_text.split()
    return " ".join(words)

def extract_visible_headers(doc):
    """Collect the visible table headers as ``(column_id, label)`` pairs.

    Header cells are the ``<th id=...>`` children of ``<tr id="head-row">``,
    taken in document order. A cell is skipped when:

    * its id was already seen earlier in the row,
    * its id is listed in the ``hiddenColumnTitles`` input's value,
    * its inline style contains ``display:none`` or it has the ``a-hidden``
      class,
    * its text is empty after whitespace normalisation.
    """
    print('extract_visible_headers')

    # Ids of the columns the page declares as hidden.
    hidden_attr = doc.xpath('//input[@id="hiddenColumnTitles"]/@value')
    if hidden_attr:
        hidden_ids = set(parse_list(hidden_attr[0]))
    else:
        hidden_ids = set()

    visited = set()
    visible = []
    for cell in doc.xpath('//tr[@id="head-row"]/th[@id]'):
        column_id = cell.get('id')
        already_visited = column_id in visited
        visited.add(column_id)
        if already_visited or column_id in hidden_ids:
            continue

        # Extra defence: also honour inline hiding via style or class name.
        inline_style = (cell.get('style') or '').replace(' ', '').lower()
        css_classes = (cell.get('class') or '').split()
        if 'display:none' in inline_style or 'a-hidden' in css_classes:
            continue

        text = clean_text(cell)
        if text:
            visible.append((column_id, text))
    return visible

def extract_rows(doc, headers):
    """Build one ``{label: text}`` dict per data row of the ``mt-table``.

    ``headers`` is an iterable of ``(column_id, label)`` pairs. For each
    ``<tr>`` whose class contains ``mt-row``, the cell for a column is the
    first ``<td>`` whose ``data-column`` attribute equals the column id; a
    missing cell yields ``""``. Rows in which every value is empty are
    dropped.
    """

    def cell_text(table_row, column_id):
        # Data cells name their column through the data-column attribute.
        cells = table_row.xpath('.//td[@data-column=$c]', c=column_id)
        if not cells:
            return ""
        return clean_text(cells[0])

    collected = []
    data_rows = doc.xpath('//table[contains(@class,"mt-table")]//tr[contains(@class,"mt-row")]')
    for table_row in data_rows:
        record = {label: cell_text(table_row, column_id) for column_id, label in headers}
        # A row counts as data as soon as any visible column has a value.
        if any(record.values()):
            collected.append(record)
    return collected

def _split_batch_entry(id_asin):
    """Split an ``"<asin>|-|<id>"`` batch entry into ``(asin, row_id)``."""
    parts = id_asin.split('|-|')
    return parts[0], int(parts[1])


def _crawl_asin(page, captured, asin, row_id):
    """Search one ASIN in the variation wizard and persist what was captured.

    ``captured`` is the list the ``requestfinished`` listener appends
    response bodies to; it is emptied here before the search is triggered.
    """
    captured.clear()
    page.wait_for_timeout(1000)
    page.locator('//*[@id="varwizard_accordion"]/div[1]/div/div[1]/h5/a/i').click()
    page.wait_for_timeout(1000)
    page.locator('//*[@id="varwiz-search-text"]').fill(f'{asin}')
    page.wait_for_timeout(5000)
    page.locator('//*[@id="a-autoid-0"]/span/input').click()
    # Fixed wait that gives the "populate" responses time to arrive.
    page.wait_for_timeout(15000)

    # Work on a snapshot so late-arriving responses cannot change the loop.
    responses = list(captured)
    print(f'请求asin {asin} data_list::', len(responses))

    # The page-level check does not depend on the individual response, so
    # the page HTML is fetched once instead of once per response.
    page_reports_no_family = bool(responses) and 'you searched for is not' in page.content()

    items_list = []
    parent_asin_list = []
    var_asin_list = []
    for html_content in responses:
        unsupported = ('variation family is not supported on this' in html_content
                       and 'ERROR' in html_content)
        if unsupported or page_reports_no_family:
            found = re.findall(r'message =(.*?);', html_content)
            if found:
                error = found[0]
            else:
                error = "The ASIN you searched for is not part of any variation family"
            print('errorerror::', error)
            save_asin_var_data(asin, [], '失败', 'us', [], [], error, row_id)
            break

        # An empty or unparsable body carries no rows; skip it rather than
        # letting the parser abort the whole batch.
        if not html_content.strip():
            continue
        tree = etree.HTML(html_content)
        if tree is None:
            continue

        visible_headers = extract_visible_headers(tree)
        rows = extract_rows(tree, visible_headers)
        for row in rows:
            print(row, '233333333333')
            parentage = row.get('Parentage')
            row_asin = row.get('ASIN')
            # Rows without an ASIN value are not recorded as parent/child:
            # a None here would break the ','.join() done when saving.
            if not row_asin:
                continue
            if parentage == 'parent':
                parent_asin_list.append(row_asin)
            elif parentage == 'child':
                var_asin_list.append(row_asin)
        items_list.extend(rows)

    if items_list:
        print('items_listitems_list::', len(items_list))
        items_list_json = json.dumps(items_list)
        save_asin_var_data(asin, items_list_json, '成功', 'us', parent_asin_list, var_asin_list, None, row_id)
    # NOTE(review): when nothing was captured and no error was detected the
    # row is left untouched (the caller has already marked it '爬取中');
    # confirm whether such rows should be marked failed or re-queued.


def run_spider(asin_list):
    """Crawl variation-family data for a batch of ASINs through Seller Central.

    Args:
        asin_list: entries of the form ``"<asin>|-|<id>"`` where ``<id>`` is
            the integer id of the ``asin_variation_family_log`` row.

    Behaviour:
        * Returns immediately for an empty batch.
        * Kills running ``chrome.exe`` processes, then launches Chrome with a
          persistent profile and opens the variation wizard page.
        * For each entry, searches the ASIN, collects the bodies of finished
          requests whose URL contains ``"populate"``, parses them and stores
          the outcome through ``save_asin_var_data``.
        * If the page URL contains ``"signin"``, the current entry and every
          entry not yet processed are handed back to ``save_asin_var_data``
          with the logged-out marker (which resets them to '未开始'), a
          notification is sent, the function sleeps 120 seconds and returns.
          Previously only the current entry was reset, leaving the rest of
          the batch in the '爬取中' state it was given before this call.

    Exceptions raised by Playwright or the parsers are not caught here; the
    browser context is closed on every exit path.
    """
    print('asin_list:::', asin_list)
    print('asin_list:::', len(asin_list))
    if not asin_list:
        return
    batch = list(asin_list)

    try:
        pr_name = "chrome.exe"
        # os.system reports failure through its return value, so the handler
        # below is only a safety net.
        os.system('%s%s' % ("taskkill /F /IM ", pr_name))
    except Exception as e:
        print("强制关闭chrome.exe失败：", e)

    with sync_playwright() as _playwright:
        context = _playwright.chromium.launch_persistent_context(
            # Local Chrome user-data (profile) directory.
            user_data_dir=r"C:\Users\admin\AppData\Local\Google\Chrome\User Data",
            # Path of the locally installed Chrome executable.
            executable_path=r"C:\Users\admin\AppData\Local\Google\Chrome\Application\chrome.exe",
            accept_downloads=True,
            headless=False,  # run with a visible browser window
            bypass_csp=True,
            locale='en-GB',
            ignore_https_errors=True,
            no_viewport=True,
            slow_mo=10,
        )
        try:
            page = context.pages[0] if context.pages else context.new_page()
            print('打开浏览器请求asin:')

            captured = []

            def intercept_request(request):
                # Keep the body of every finished request whose URL contains
                # "populate".
                try:
                    if "populate" in request.url:
                        print('request.url::', request.url)
                        resp = request.response().text()
                        captured.append(resp)
                except Exception as e:
                    print("获取响应失败:", e, f"\n{traceback.format_exc()}")

            page.goto(
                'https://sellercentral.amazon.com/listing/varwiz?ref_=xx_swlang_head_xx&mons_sel_locale=en_US&languageSwitched=1')
            page.wait_for_timeout(1500)
            print('page.url::', page.url)

            # One listener serves the whole batch; it does not need to be
            # registered again for every ASIN.
            page.on("requestfinished", intercept_request)

            for position, id_asin in enumerate(batch):
                print("id_asin::", id_asin)
                asin, row_id = _split_batch_entry(id_asin)
                print('开始抓取：：', asin, 'id：：', row_id)
                if 'signin' in page.url:
                    # Logged out: hand back this entry and all remaining ones
                    # so they can be picked up again after re-login.
                    for pending in batch[position:]:
                        pending_asin, pending_id = _split_batch_entry(pending)
                        save_asin_var_data(pending_asin, [], '失败', 'us', [], [], '账号电脑退出登录', pending_id)
                    semd_ms(asin)
                    time.sleep(120)
                    return
                _crawl_asin(page, captured, asin, row_id)
        finally:
            try:
                context.close()
            except Exception as e:
                print("关闭浏览器失败：", e)


def semd_ms(asin):
    """Send a best-effort failure notification for ``asin``.

    Posts a fixed message (title includes ``asin``) to the internal
    ``sendMessage`` endpoint with a 15 second timeout and prints the reply.
    This function never raises: any failure is printed instead of being
    silently discarded, so a broken notification channel is visible in the
    log without interrupting the crawl.
    """
    try:
        url = 'http://47.112.96.71:8082/selection/sendMessage'
        data = {
            'content': '下载 变体数据失败。远程账号电脑 HM 299 421 380',
            'title': f'账号电脑  {asin} 变体数据 失败',
            'account': 'pengyanbing'
        }
        print(data)
        y = requests.post(url=url, data=data, timeout=15)
        # errors='replace' keeps an undecodable reply from turning a
        # successful send into a reported failure.
        print(y.content.decode('gbk', errors='replace'))
    except Exception as e:
        # Deliberately broad: notification is best-effort and must not
        # propagate, but the reason is logged.
        print('发送消息失败：', e)


def mysql_get_asin():
    """Poll the database forever and crawl pending ASINs in batches.

    Each cycle:
        1. selects up to 20 rows of ``asin_variation_family_log`` whose
           status is '未开始' and whose ``asin`` has length 10;
        2. marks those rows '爬取中' (with the current epoch seconds as
           ``update_time``);
        3. passes them to ``run_spider`` as ``"<asin>|-|<id>"`` strings;
        4. sleeps 3 seconds.

    Any exception in a cycle is printed with its traceback and the loop
    continues. The 3 second pause now also applies after a failed cycle, so
    a persistent error (for example an unreachable database) no longer
    causes a tight retry loop.

    NOTE(review): the select and the update are separate statements, so two
    pollers running at once could claim the same rows -- confirm only one
    instance runs.
    """
    spider_state_sql = """select asin,id from asin_variation_family_log where status = '未开始' and length(asin)=10 limit 20 """
    while True:
        try:
            print('轮询 mysql 查询:', datetime.now().strftime("%m-%d %H:%M:%S"))
            engine_us_mysql = mysql_connect()
            print('spider_state_sql:', spider_state_sql)
            df_asin = engine_us_mysql.read_sql(spider_state_sql)
            if not df_asin.empty:
                update_time = int(time.time())
                # Build the IN list explicitly from integer ids; this covers
                # one id and many ids alike and does not depend on how a
                # tuple happens to be rendered as text.
                row_ids = [int(row_id) for row_id in df_asin['id']]
                id_csv = ','.join(str(row_id) for row_id in row_ids)
                sql_update = f"""UPDATE asin_variation_family_log a set status='爬取中',update_time='{update_time}' where a.id in ({id_csv})"""
                with engine_us_mysql.begin() as conn:
                    print('UPDATE_sql:', sql_update)
                    conn.execute(sql_update)
                _asin_lis = [f'{asin}|-|{row_id}' for asin, row_id in zip(df_asin['asin'], row_ids)]
                print("_asin_lis:::", _asin_lis, )
                print("_asin_lis::: len ", len(_asin_lis))
                run_spider(_asin_lis)  # hand the batch to the crawler
        except Exception as e:
            print('查询 mysql_get_asin 报错：：', e, f"\n{traceback.format_exc()}")
        # Pause between cycles on both the success and the failure path.
        time.sleep(3)


def save_asin_var_data(asin, data_json, spider_value, site_name, parent_asin_list, var_asin_list, error, id):
    """Write the crawl outcome for one ``asin_variation_family_log`` row.

    Args:
        asin: ASIN being saved; used only in log output.
        data_json: JSON text for ``variation_family``. Callers pass ``[]``
            when there is no data, which is stored as the text ``[]``.
        spider_value: value written to ``status``.
        site_name: value matched against the ``site_name`` column.
        parent_asin_list: parent ASINs, stored comma-joined in
            ``parent_asin``.
        var_asin_list: child ASINs, stored as an array literal
            (``'{a,b}'``) in ``variation_asin``.
        error: ``None`` for success; the marker ``'账号电脑退出登录'`` resets
            the row to '未开始' without touching other columns; any other
            value is stored in ``err_msg`` together with the other fields.
        id: primary key of the row (name kept for caller compatibility even
            though it shadows the builtin).

    The statement is assembled as text, so every interpolated string has
    single quotes removed and ``%`` doubled. This is now applied to the ASIN
    lists as well, and empty/``None`` ASIN entries are dropped so that the
    join cannot raise. The update is retried every 5 seconds until it
    succeeds.
    """

    def _sql_safe(text):
        # Strip quotes and double '%' for values embedded in the SQL text.
        return text.replace('%', '%%').replace("'", "")

    parent_asin = ','.join(_sql_safe(str(a)) for a in (parent_asin_list or []) if a)
    variation_items = ','.join(_sql_safe(str(a)) for a in (var_asin_list or []) if a)
    # Array literal; an empty list yields '{}' exactly as before.
    variation_literal = "'{" + variation_items + "}'"

    if data_json:
        data_json = _sql_safe(data_json)
    if error:
        error = _sql_safe(error).replace('"', '')

    row_id = int(id)
    while True:
        try:
            engine_us_mysql = mysql_connect()
            update_time = int(time.time())
            print(f'更新 {asin} 数据：')
            with engine_us_mysql.begin() as conn:
                if error is None:
                    sql = f"""
                    UPDATE asin_variation_family_log 
                    SET variation_family='{data_json}', status='{spider_value}', update_time='{update_time}' ,parent_asin='{parent_asin}',variation_asin={variation_literal}
                    WHERE id={row_id} AND site_name='{site_name}'
                    """
                    print('成功',sql)
                elif error == '账号电脑退出登录':
                    # Logged out: only put the row back in the queue.
                    sql = f"""
                        UPDATE asin_variation_family_log SET status='未开始' WHERE id={row_id} AND site_name='{site_name}'
                        """
                else:
                    sql = f"""
                            UPDATE asin_variation_family_log 
                            SET variation_family='{data_json}', status='{spider_value}', update_time='{update_time}' ,parent_asin='{parent_asin}',variation_asin={variation_literal},err_msg='{error}'
                            WHERE id={row_id} AND site_name='{site_name}'
                           """
                    print('error is not None：： ', sql)
                conn.execute(sql)
            print(asin, '更新成功')
            break
        except Exception as e:
            # Reconnect and try again after a short pause.
            print('存储数据报错：', e)
            time.sleep(5)


# Script entry point: start the endless polling loop only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    mysql_get_asin()
