import ast
import json
import random
import time
import uuid
from urllib.parse import quote_plus
from collections import deque
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree
import pandas as pd
from sqlalchemy import create_engine, text, bindparam
from sqlalchemy.engine import URL
from sqlalchemy.pool import NullPool

from utils.requests_param import Requests_param_val
from utils.db_connect import BaseUtils
from threading_spider.db_connectivity import connect_db

from amazon_params.params import DB_CONN_DICT


class CategoryParser:
    """Derive category identifiers from a best-seller category URL.

    The URL is split on ``/`` after trailing slashes are removed. When the
    URL contains ``ref=``, the last segment is treated as the ref suffix, the
    segment before it as the category id and the one before that as the
    first-level category id. Without ``ref=`` the last two segments are the
    category id and the first-level id respectively.
    """

    @staticmethod
    def safe_index(seq, idx, default=None):
        """Return ``seq[idx]``, or *default* when *idx* is out of range.

        Negative indices count from the end of *seq*, exactly like regular
        Python indexing; ``parse`` relies on this to read trailing URL
        segments.
        """
        size = len(seq)
        return seq[idx] if -size <= idx < size else default

    @classmethod
    def parse(cls, nodes_num, url):
        """Extract the category id, first-level id and parent id from *url*.

        Args:
            nodes_num: Depth of the node being parsed; ``1`` marks the root.
            url: Category page URL.

        Returns:
            dict with the keys ``category_id``, ``category_first_id`` and
            ``category_parent_id``; values are strings or ``None``.
        """
        # Use the same slash-stripped form for both the segment split and the
        # suffix tests so a trailing "/" cannot change the detected level.
        normalized = url.rstrip('/')
        parts = normalized.split('/')

        if 'ref=' in url:
            ref_suffix = cls.safe_index(parts, -1)
            cid = cls.safe_index(parts, -2)
            first_id = cls.safe_index(parts, -3)
        else:
            ref_suffix = None
            cid = cls.safe_index(parts, -1)
            first_id = cls.safe_index(parts, -2)

        # Determine level
        if nodes_num == 1:
            level = 1
        elif normalized.endswith('_0'):
            level = 2
        elif first_id is not None and normalized.endswith(f"{first_id}_1"):
            level = 3
        else:
            level = 4

        # Compute parent
        if level == 1:
            # The root node gets the fixed id '0' and has no parent.
            pid, cid, first_id = None, '0', None
        elif level == 2:
            # A second-level category is its own first-level category.
            pid = '0'
            first_id = cid
        elif level == 3:
            pid = first_id
        else:
            # Deeper pages carry the parent id as the last "_" token of the
            # ref suffix.
            pid = ref_suffix.split('_')[-1] if ref_suffix else None

        return {
            'category_id': cid,
            'category_first_id': first_id,
            'category_parent_id': pid
        }


class RequestSession:
    """HTTP helper that fetches pages with rotating cookies and headers.

    Each instance is bound to one ``site`` suffix, which is appended to
    ``www.amazon.`` to form the host and base URL.
    """
    # NOTE(review): for site='us' this yields www.amazon.us — confirm that is
    # the intended host.

    def __init__(self, site):
        """Create the underlying ``requests`` session and an empty cookie pool.

        Args:
            site: Domain suffix used to build ``host`` and ``site_url``.
        """
        self.site = site
        self.site_url = f'https://www.amazon.{site}'
        self.host = f'www.amazon.{site}'
        self.session = requests.Session()
        # Raw cookie payloads waiting to be used; refilled by ensure_cookies().
        self.cookies = deque()
        self.param_val = Requests_param_val(site_name=site)

    def ensure_cookies(self):
        """Refill the cookie pool from ``param_val.get_cookie()`` when empty."""
        if not self.cookies:
            self.cookies.extend(self.param_val.get_cookie().values())

    def next_cookie(self):
        """Pop the next cookie payload and render it as a ``Cookie`` header value.

        A payload is either a JSON document or a Python-literal string that
        describes one cookie dict or a list of them, each with ``name`` and
        ``value`` keys. Entries missing those keys are skipped; a payload
        that is not a dict or list yields an empty string.

        Returns:
            str: ``"name=value;"`` pairs concatenated in payload order.

        Raises:
            IndexError: the pool is still empty after a refill attempt.
            ValueError, SyntaxError: the payload is neither valid JSON nor a
                plain Python literal.
        """
        self.ensure_cookies()
        raw = self.cookies.popleft()

        if isinstance(raw, (list, dict)):
            # Already decoded by the provider; nothing to parse.
            parsed = raw
        else:
            try:
                parsed = json.loads(raw)
            except (TypeError, ValueError):
                # Payloads stored as Python reprs (single-quoted dicts) are not
                # valid JSON. This fallback used to be eval(); because the
                # payload comes from an external store it is now parsed with
                # ast.literal_eval, which accepts literals only and never
                # executes code.
                parsed = ast.literal_eval(raw)

        # Ensure list
        if isinstance(parsed, dict):
            parsed = [parsed]
        if not isinstance(parsed, list):
            parsed = []

        # Build cookie string
        pairs = []
        for entry in parsed:
            try:
                pairs.append(f"{entry['name']}={entry['value']};")
            except (KeyError, TypeError):
                # Not a mapping, or lacks name/value: skip this entry.
                continue
        return ''.join(pairs)

    def build_headers(self, referer):
        """Build request headers with a randomized Chrome User-Agent.

        Args:
            referer: Value used for both ``Origin`` and ``Referer``.

        Returns:
            dict: The fixed headers, the next cookie from the pool, and one
            extra header whose name is a random letter from ``a``-``f`` and
            whose value is a fresh UUID4 string.
        """
        n = random.randint(70, 114)
        ua = (
            f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            f"AppleWebKit/537.36 (KHTML, like Gecko) "
            f"Chrome/{n}.0.{random.randint(1000,5000)}.{random.randint(1,181)} Safari/537.36"
        )
        headers = {
            'Connection': 'close',
            'Authority': self.host,
            'Accept': 'text/html,*/*',
            'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'Origin': referer,
            'Referer': referer,
            'User-Agent': ua,
            'Cookie': self.next_cookie()
        }
        headers[random.choice('abcdef')] = str(uuid.uuid4())
        return headers

    def fetch(self, url, retries=5, timeout=20):
        """GET *url* and return the parsed HTML tree.

        Every attempt builds fresh headers (and therefore takes a new cookie).
        A response counts as successful when its status is 200 or 201 and the
        body does not contain ``'Enter the characters'``. Unsuccessful
        responses are retried immediately; attempts that raise are logged and
        followed by a sleep of ``1 + attempt`` seconds.

        Args:
            url: Page to download; also used as the referer.
            retries: Maximum number of attempts.
            timeout: Per-request timeout in seconds.

        Returns:
            The result of ``etree.HTML`` on the response text.

        Raises:
            RuntimeError: no attempt produced a successful response.
        """
        for attempt in range(retries):
            try:
                hdr = self.build_headers(url)
                # Certificate verification is disabled for these requests.
                resp = self.session.get(url, headers=hdr, timeout=timeout, verify=False)
                txt = resp.text
                if resp.status_code in (200, 201) and 'Enter the characters' not in txt:
                    return etree.HTML(txt)
            except Exception as e:
                print(f"[fetch] attempt {attempt} error: {e}")
                time.sleep(1 + attempt)
        raise RuntimeError(f"无法获取 URL: {url}")


class BSRCategorySpider(BaseUtils):
    """Crawl best-seller category pages level by level and store ranked ASINs.

    Categories are read from the ``<site>_bs_category`` table, each page is
    fetched concurrently, and the ranked items found on the pages are
    appended to ``<site>_bs_category_top100_asin``.
    """

    def __init__(self, site):
        """Prepare the HTTP session, URL parser, DB engine and run metadata.

        Args:
            site: Site code used for the host name and as table-name prefix.
        """
        super().__init__()
        self.site = site
        self.req = RequestSession(site)
        self.parser = CategoryParser()
        self._init_db()
        # Rows collected by _collect_asin(); written out by save_asin_data().
        self.asin_items = []
        self.week = int(time.strftime("%W"))
        self.year_month = time.strftime("%Y_%m")
        self.time_strftime_ = time.strftime("%Y-%m-%d", time.localtime())
        # Column order must match the tuples appended in _collect_asin().
        self.columns = [
            'bsr_id', 'asin', 'bsr_rank', 'price',
            'rating', 'reviews', 'week', 'year_month', 'cate_current_id'
        ]

    def _init_db(self):
        """Create the SQLAlchemy engine from ``DB_CONN_DICT``.

        ``NullPool`` is used, so connections are not pooled between uses.
        """
        cfg = DB_CONN_DICT
        url = URL.create(
            drivername='mysql+pymysql',
            username=cfg['mysql_user'],
            password=cfg['mysql_pwd'],
            host=cfg['mysql_host'],
            port=cfg['mysql_port'],
            database=cfg['mysql_db'],
            query={'charset': 'utf8mb4'}
        )
        self.engine = create_engine(
            url,
            poolclass=NullPool,
            connect_args={'connect_timeout': 10}
        )

    def process_node(self, rec, level):
        """Fetch one category page, collect its ASINs and list its children.

        Args:
            rec: ``(id, en_name, path, category_id)`` row of the category.
            level: Depth of this category.

        Returns:
            list of ``(category_id, name, url, parent_category_id)`` tuples,
            one per child link found under this category's tree item.

        Raises:
            RuntimeError: the page could not be fetched.
        """
        _id, name, path, _cid = rec
        tree = self.req.fetch(path)
        # The name is passed as an XPath variable rather than interpolated
        # into the expression, so names containing quotes (e.g. "Women's ...")
        # cannot break the query.
        a_nodes = tree.xpath(
            "//div[@role='treeitem' and span/text()=$cat_name]"
            "/following-sibling::div[@role='group']/div[@role='treeitem']/a",
            cat_name=str(name),
        )
        parent_cid = self.parser.parse(level, path)['category_id']
        next_recs = []
        for a in a_nodes:
            href = a.get('href')
            if not href:
                # Anchor without a target: nothing to follow.
                continue
            full = self.req.site_url + href
            info = self.parser.parse(level + 1, full)
            # NOTE(review): this stores the current category's name, not the
            # child's link text. run() only checks whether the list is empty,
            # so it is left as is — confirm before relying on the name.
            next_recs.append((info['category_id'], name, full, parent_cid))
        self._collect_asin(tree, _id, parent_cid)
        return next_recs

    def _collect_asin(self, tree, bsr_id, current_cid):
        """Append the ranked items listed on a page to ``self.asin_items``.

        The items are read from the ``data-client-recs-list`` attribute of the
        ``p13n-desktop-grid`` element. The attribute comes from a remote page,
        so it is decoded with ``json.loads`` and never evaluated as code.
        Entries without an id or without an integer rank are skipped.

        Args:
            tree: Parsed HTML tree of the category page.
            bsr_id: Database id of the category row.
            current_cid: Category id parsed from the page URL.
        """
        recs = tree.xpath("//div[@class='p13n-desktop-grid']/@data-client-recs-list")
        if not recs:
            return
        try:
            data_list = json.loads(recs[0])
        except ValueError as e:
            print(f"[collect_asin] bsr_id {bsr_id} unparseable recs list: {e}")
            return
        if not isinstance(data_list, list):
            return
        for data in data_list:
            if not isinstance(data, dict):
                continue
            asin = data.get('id')
            rank = (data.get('metadataMap') or {}).get('render.zg.rank')
            try:
                rank = int(rank)
            except (TypeError, ValueError):
                # Missing or non-numeric rank: the row would be unusable.
                continue
            if not asin:
                continue
            self.asin_items.append(
                (bsr_id, asin, rank, None, None, None,
                 self.week, self.year_month, current_cid)
            )

    def _process_node_safely(self, rec, level):
        """Run ``process_node`` and turn any failure into an empty result."""
        try:
            return self.process_node(rec, level)
        except Exception as e:
            # Worker boundary: one bad page must not abort the whole level,
            # whose rows have already been marked as processed.
            print(f"[process_node] level {level} path {rec[2]} error: {e}")
            return []

    def run_level(self, level, max_workers=10):
        """Process every pending category of one level concurrently.

        Pending rows are those with ``nodes_num=level``, ``category_state=1``
        and no ``delete_time``. They are set to ``category_state=2`` before
        their pages are fetched.

        Args:
            level: Depth to process.
            max_workers: Size of the thread pool.

        Returns:
            list: Child records gathered from all successfully processed
            nodes; empty when no rows are pending.
        """
        df = pd.read_sql(
            f"SELECT id,en_name,path,category_id FROM {self.site}_bs_category "
            f"WHERE nodes_num={level} AND category_state=1 AND delete_time IS NULL",
            self.engine
        )
        if df.empty:
            return []
        # tolist() yields native Python values, which every DB driver can
        # serialize, instead of NumPy scalars.
        ids = df['category_id'].tolist()
        # mark processed using expanding
        stmt = text(
            f"UPDATE {self.site}_bs_category SET category_state=2 "
            f"WHERE category_id IN :ids"
        ).bindparams(bindparam('ids', expanding=True))
        with self.engine.begin() as conn:
            conn.execute(stmt, {'ids': ids})

        recs = list(df.to_records(index=False))
        next_recs = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for out in executor.map(lambda r: self._process_node_safely(r, level), recs):
                next_recs.extend(out)
        return next_recs

    def save_asin_data(self):
        """Write the collected ASIN rows to ``<site>_bs_category_top100_asin``.

        Rows are de-duplicated on ``(asin, bsr_rank, cate_current_id)`` and
        stamped with today's date in ``date_info``. Does nothing when no rows
        were collected.
        """
        if not self.asin_items:
            return
        df = pd.DataFrame(self.asin_items, columns=self.columns)
        df['date_info'] = self.time_strftime_
        df.drop_duplicates(['asin', 'bsr_rank', 'cate_current_id'], inplace=True)
        df.to_sql(
            f'{self.site}_bs_category_top100_asin',
            con=self.engine,
            if_exists='append',
            index=False
        )

    def send_ms(self):
        """POST a completion notice to the workflow endpoint.

        Up to three attempts are made; an attempt that raises is logged and
        followed by a five-second pause. The loop stops after the first
        request that returns a response, whatever its status.
        """
        for _ in range(3):
            try:
                url = 'http://selection.yswg.com.cn:8080/soundasia_selection/workflow/emit'
                data = {
                    'dateType': 'day',
                    'reportDate': self.time_strftime_,
                    'statusVal': 3,
                    'siteName': self.site,
                    'remark': 'bsr榜单爬取完毕',
                    'isEnd': '是',
                    'tableName': f'{self.site}_bs_category_top100_asin',
                    'status': 'bsr榜单爬取完毕'
                }
                resp = requests.post(url, headers={'Content-Type':'application/json'}, json=data, timeout=10)
                print('通知返回:', resp.text)
                break
            except Exception as e:
                print('send_ms error:', e)
                time.sleep(5)

    def run(self, max_level=4):
        """Crawl levels ``1..max_level`` and then persist the collected rows.

        The crawl stops early as soon as a level yields no child records.

        Args:
            max_level: Deepest level to process.
        """
        level = 1
        while level <= max_level:
            next_nodes = self.run_level(level)
            if not next_nodes:
                break
            level += 1
        self.save_asin_data()


if __name__ == '__main__':
    # Crawl each listed site in turn, then post the completion notice for it.
    for site_code in ('us',):
        crawler = BSRCategorySpider(site_code)
        crawler.run()
        crawler.send_ms()
