import sys
import os

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from amazon_save_db.save_asin_detail_pg import Save_asin_detail
from amazon_params import py_ja3
from queue import Queue
from lxml import etree
import requests
import urllib3
import threading
import random
import uuid
import pandas as pd
import time
import traceback

# Silence urllib3's InsecureRequestWarning: requests in this module are sent with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Module-level requests session; async_asin_pg.get_asin uses it from every worker thread.
# NOTE(review): one Session is shared by all threads and re-mounted per request — confirm this is safe.
sess = requests.Session()
# Called again without a category, i.e. with urllib3's default warning category.
urllib3.disable_warnings()


class async_asin_pg():
    def __init__(self, site_name='us', proxy_name=None, week=None, month=None, spider_int=None):
        """Configure site endpoints, the DB helper and all crawl bookkeeping.

        Args:
            site_name: Marketplace code. Only 'nl' and 'be' have endpoints
                configured in this class; any other value (including the
                default 'us') raises ValueError.
            proxy_name: Forwarded unchanged to Save_asin_detail.
            week: Accepted for signature compatibility; not used by this class.
            month: Forwarded unchanged to Save_asin_detail.
            spider_int: Forwarded unchanged to Save_asin_detail.

        Raises:
            ValueError: If ``site_name`` has no endpoint configuration.
        """
        # site code -> (host, site label stored in the DB, home page URL)
        site_endpoints = {
            'nl': ('www.amazon.nl', 'Amazon.nl', 'https://www.amazon.nl/'),
            'be': ('www.amazon.com.be', 'Amazon.com.be', 'https://www.amazon.com.be/'),
        }
        # Fail fast: without host/site/site_url every later step raises
        # AttributeError, which read_db_data() would swallow and retry forever.
        if site_name not in site_endpoints:
            raise ValueError(
                f"unsupported site_name {site_name!r}; expected one of {sorted(site_endpoints)}")
        self.site_name = site_name  # marketplace code
        self.host, self.site, self.site_url = site_endpoints[site_name]

        # NOTE(review): the helper is always created with site_name='us',
        # regardless of the site being crawled — confirm this is intended.
        self.save_asin_detail = Save_asin_detail(site_name='us', proxy_name=proxy_name, month=month,
                                                 spider_int=spider_int)
        self.engine = self.save_asin_detail.engine
        self.reuests_para_val = self.save_asin_detail.reuests_para_val

        # Outcome lists; the number is the DB state written back by save_detail().
        self.requests_error_asin_list = []  # 1
        self.asin_list_update = []  # 3
        self.asin_not_found_list = []  # 4
        self.asin_not_sure_list = []  # 6
        self.asin_not_foot_list = []  # 7
        self.asin_not_foot2_list = []  # 8
        self.asin_not_buyBox_list = []  # 9
        self.asin_not_response_list = []  # 10
        self.asin_not_redirect_list = []  # 12
        self.asin_not_div_id_dp_list = []  # 13: returned HTML has no <div id="dp">

        self.cookies_queue = Queue()  # cookie queue
        self.item_queue = Queue()  # queue of parsed item details
        self.queries_asin_queue = Queue()  # ASINs waiting to be crawled
        self.buyBox_list = []  # seller name / url entries
        self.asin_detail_list = []  # parsed ASIN detail rows
        self.buyBoxname_asin_list = []  # ASIN seller items
        self.delete_cookies_list = []  # cookies that showed a Chinese postcode
        self.stop_item_queue = True  # loop-exit flag for the storage loop
        self.spider_de_feedback = False
        self.cookie_dict_delete_id = {}
        self.headers_num_int = 0  # captcha pages seen; get_asin() stops above 20
        self.db_syn = 'mx_self_asin'  # table holding the crawl work list

        # Per-batch counters: each event appends one marker value.
        self.yzm_err_total_list = []  # captcha pages (1)
        self.asin_request_errp_total_list = []  # request exceptions (2)
        self.success_asin_total_list = []  # successful responses (3)
        self.hour_total_count_list = []  # hourly count
        self.request_total_count_list = []  # all requests (4)

    def get_asin(self):
        while True:
            if self.queries_asin_queue.empty() == False:
                # if self.cookies_queue.empty():
                #     cookies_dict = self.get_cookie()
                #     self.cookie_dict_delete_id = cookies_dict
                #     for ck in cookies_dict.values():
                #         self.cookies_queue.put(ck)
                    # 获取组装cookie
                if self.site_name == 'nl':
                    cookie_str = 'session-id-time=2082787201l; i18n-prefs=EUR; lc-acbnl=nl_NL; session-id=258-9877427-7113708; ubid-acbnl=260-0748357-3075024; session-token=UTNqLx1RowN43cl0tuKIbus1eJ4mdzedRH8K+EKNBMCufgND3B/1CEWTOJOKN3cyUVR6ewVTBY5reDimpP0aupMiF1mmSuARVQ5YKT3RYY812nTvMvOJuabHTmjaEtWDwC+xoa4ONIB6tWbqLNe7i2anozIl2udF3GUGHLXi4bkNLgMASESfzLT3S1w/zGlpgFWokksY0YirmSx4VKZBJF6tN7W5bYKJnb48whyZxy8C2eUsEsMzSDPZu+hrHZC43sfD+NwXctP1hCAu4ZG+KBGm4uLrYn7xUdUS6OcMUgUt/i2fyQ9HiJa7gCMwbCjRpvfWXSXpBiPuNkEy7qVogXOuvPuLUf0e; csm-hit=tb:N8HMSFZKE649EDYJ3005+s-63FTSNYM84VWWRPTDZXZ|1750992899388&t:1750992899388&adb:adblk_no; rxc=AFgmL96/P3FKgeN5nE4'
                else:
                    cookie_str='session-id=257-2024891-7405232; session-id-time=2082787201l; i18n-prefs=EUR; lc-acbbe=fr_BE; ubid-acbbe=259-2374101-7381127; session-token="lIsCBwGfDDdeTDh48EPHctstz0RoCBZisu7/yAn+kcxTPNVVxhQso2tkVwf/TIYspUqmnNnzoXzUWhBnfoQoh7UXbR0djWqMxG9mUjJ5P9SGbXV/dIyMKW6XImFraizgbleIs8Tia9e0aB6xK0kJUM2NXTotpWRWJTwzJHsXowTdBAv6gh/6NVrUkqb9Y/qRvmd46YKdVoVdv6YqR4JqV8GLUqdEUeBbrd2CqIHjmY2vCT3JoB55As4b8bEPOSpbcdANvu+a/cUKDOlteDBy1oHH2MBD61v+fBdfMspAGqWZPiJQiUWm8/HkA3xVq0JAsO0HNIC7khARYcq5sl3wsjS/unFGCqHlExOZSvY7Foc="; csm-hit=tb:KVGGZ06607QF8004EYTP+s-V3FSKRRCHCSNXH4NM7EZ|1750994443901&t:1750994443901&adb:adblk_no; rxc=AHCAH1p1m1Xas60/ljk'
                querys = self.queries_asin_queue.get()
                asin = querys
                n = random.randint(116, 120)
                ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36'
                headers = {
                    'connection': 'close',
                    'authority': self.host,
                    'accept': 'text/html,*/*',
                    'accept-language': 'zh-CN,zh;q=0.9',
                    'cache-control': 'no-cache',
                    'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
                    'sec-ch-ua-mobile': '?0',
                    'user-agent': ua
                }
                if asin:
                    headers['origin'] = f'{self.site_url}dp/{asin}'
                    headers['referer'] = f'{self.site_url}dp/{asin}'
                elif self.site_url:
                    headers['origin'] = self.site_url
                    headers['referer'] = self.site_url
                headers["cookie"] = cookie_str
                alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
                            's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
                k = ""
                for i in (0, random.randint(0, 26)):
                    k += random.choice(alphabet)
                headers[k] = str(uuid.uuid4())
                if self.headers_num_int > 20:
                    break

                scraper_url = self.site_url + 'dp/' + asin
                self.request_total_count_list.append(4)
                print('scraper_url::', scraper_url)
                try:
                    sess.mount(self.site_url, py_ja3.DESAdapter())
                    resp = sess.get(scraper_url, headers=headers,
                                    timeout=10, verify=False)
                    voer_list_text = ['Voer de tekens in die je hieronder ziet', "Enter the characters you see below",
                                      "Geben Sie die Zeichen unten ein", "Introduce los caracteres que se muestran",
                                      "Saisissez les caractères que vous voyez",
                                      "Inserisci i caratteri visualizzati nello spazio"]
                    for voer in voer_list_text:
                        if voer in resp.text:
                            print('验证码')
                            self.yzm_err_total_list.append(1)
                            self.headers_num_int += 1
                            self.requests_error_asin_list.append(asin)
                            continue
                except Exception as e:
                    self.asin_request_errp_total_list.append(2)
                    print("请求错误错误: ", e)
                    if 'Received response with content-encoding: gzip' in str(e):
                        self.asin_not_found_list.append(asin)
                    else:
                        self.requests_error_asin_list.append(asin)
                    continue
                response_url = resp.url
                response = resp.text
                response_s = etree.HTML(response)
                self.success_asin_total_list.append(3)
                if ("Page Not Found" in response) or ("We are sorry! This Gift Card is not available" in response) or (
                        "500 - An error occurred" in response) or ("Sorry! Something went wrong!" in response):
                    self.asin_not_found_list.append(asin)
                    continue
                if (
                        "How Amazon Pharmacy works" in response and "Sign in to Pharmacy" in response and "About this medication" in response) or (
                        "pharmacy." in response_url) or (
                        "Buy Amazon Coins" in response and "Sold and delivered by ACI Gift Cards LLC, an Amazon company" in response) or (
                        "Youtubers Life" in response and "Become the most successful youtuber on the planet! Create videos" in response):
                    self.asin_not_redirect_list.append(asin)
                    continue
                if ("keywords" in response_url) or ("dp/" not in response_url) or (
                        "ref=" in response_url and "encoding=" in response_url) or (asin not in response_url) or (
                        "ASIN=" in response_url and "ref_=lx_bd" in response_url):
                    self.asin_not_redirect_list.append(asin)
                    continue
                # 获取邮编
                try:
                    ingress = response_s.xpath("//span[@id='glow-ingress-line2']/text()")
                except Exception as e:
                    self.asin_not_response_list.append(asin)
                    continue
                try:
                    ingress = ingress[0].strip()
                except:
                    ingress = None
                print(ingress, ' 打印 邮编 ', resp.url)
                if ingress:
                    if ("Enter the characters you see below" in resp.text) or (
                            "Geben Sie die Zeichen unten ein" in resp.text) or (
                            "Introduce los caracteres que se muestran" in resp.text) or (
                            "Saisissez les caractères que vous voyez" in resp.text) or (
                            "Inserisci i caratteri visualizzati nello spazio" in resp.text):
                        print('验证码')
                        self.requests_error_asin_list.append(asin)
                        continue
                div_dp = response_s.xpath('//div[@id="dp"]')
                if div_dp:
                    title_xpath_list = ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
                                        '//h1[@class="a-size-large a-spacing-micro"]/text()',
                                        '//h1[@class="gc-detail-page-title"]//text()',
                                        '//h1[@id="mas-atf-product-title"]/span/span/text()',
                                        '//h1[@class="a2s-title-content"]/text()',
                                        '//span[@id="collection-title"]/text()', '//title[0]/text()',
                                        '//title[@dir="ltr"]/text()',
                                        '//h1/span/text()',
                                        '//h1/span[@id="gc-asin-title"]/text()']
                    title = None
                    for title_xpath in title_xpath_list:
                        title_list = response_s.xpath(title_xpath)
                        if title_list:
                            title = title_list[0].strip()
                            break
                    five_xpath_list = ["//div[@id='feature-bullets']/ul//li/span/text()",
                                       '//div[contains(@id,"productFacts")]//ul//li//span[contains(@class,"a-size-base")]/text()']
                    five_description = None
                    for title_xpath in five_xpath_list:
                        five_text_list = response_s.xpath(title_xpath)
                        if five_text_list:
                            text_list = []
                            for f in five_text_list:
                                if len(f) > 10:
                                    text_list.append(f.strip())
                            if text_list:
                                five_description = '|-|'.join(text_list)
                            else:
                                five_description = None
                            break
                    products_list = ['//div[@data-feature-name="productDescription"]//p//text()',
                                     "//h2[contains(text(),'Product Description')]/following-sibling::div//p/text()"]
                    product_description = None
                    for i in products_list:
                        products_text_list = response_s.xpath(i)
                        if products_text_list:
                            products_text = ''.join(products_text_list).strip()
                            if len(products_text) > 20:
                                product_description = products_text
                                break
                            else:
                                product_description = None
                    items = {'asin': asin, 'title': title, 'five_description': five_description,
                             'product_description': product_description}
                    print(items)
                    self.asin_detail_list.append([asin, title, product_description, five_description, self.site])
                else:
                    self.asin_not_div_id_dp_list.append(asin)
                    continue
            else:
                print(f"当前线程-已完成-爬取-跳出循环")
                break

    def init_list(self):
        print("=======清空变量==========")
        self.asin_variation_list = []  # 变体
        self.asin_not_found_list = []  # 4
        self.asin_not_sure_list = []  # 6
        self.asin_not_foot_list = []  # 7
        self.asin_not_foot2_list = []  # 8
        self.asin_not_buyBox_list = []  # 9
        self.asin_not_response_list = []  # 10
        self.asin_not_redirect_list = []  # 12
        self.asin_not_div_id_dp_list = []  # 13 返回html没有包含div @id=dp,状态13
        self.requests_error_asin_list = []  # 1
        self.asin_list_update = []  # 3
        self.item_queue = Queue()  # 存储 item 详情数据队列
        self.queries_asin_queue = Queue()  # 需要爬取的asin队列
        self.buyBox_list = []  # 卖家名称 url 列表
        self.asin_detail_list = []  # 存储asin 详情的列表
        self.buyBoxname_asin_list = []  # asin 卖家的列表
        self.delete_cookies_list = []  # 存储出现中国邮编的cookie
        self.all_img_video_list = []  # 存储图片 和 图片位置，视频图片
        self.star_list = []
        self.add_cart_asin_list = []  # 存储绑定购买的asin
        self.asin_brand_list = []
        self.bs_category_asin_list = []
        self.bs_category_asin_list_pg = []
        self.headers_num_int = 0
        # 验证码
        self.yzm_err_total_list = []
        # 异常
        self.asin_request_errp_total_list = []
        # 成功
        self.success_asin_total_list = []
        # 总请求
        self.request_total_count_list = []
        # 每小时
        self.hour_total_count_list = []
        self.cookies_queue = Queue()
        self.asin_detail_list = []


    def run(self):
        while True:
            asin_list = self.read_db_data()
            if asin_list:

                if asin_list:
                    for asin in asin_list:
                        self.queries_asin_queue.put(asin)
                    html_thread = []
                    for i in range(15):
                        thread2 = threading.Thread(target=self.get_asin)
                        thread2.start()
                        html_thread.append(thread2)
                    for t2 in html_thread:
                        t2.join()
                print('线程全部结束')
                if self.asin_detail_list:
                    self.save_detail()
                self.init_list()
                break
            else:
                break

    def read_db_data(self):
        """Claim up to 300 pending rows (state 1) for this site and return their ASINs.

        The selected rows are de-duplicated by ASIN, kept in ``self.df_read``
        and switched to state 2 in the same transaction. Returns the list of
        ASINs, or ``[]`` when nothing is pending. Any exception is logged and
        the whole step is retried after a random 10-20.5 s pause.
        """
        while True:
            try:
                self.engine = self.save_asin_detail.engine
                with self.engine.begin() as conn:
                    sql_read = f"SELECT asin, id FROM {self.db_syn} WHERE STATE = 1 and site='{self.site}' limit 300;"
                    print(sql_read)
                    rows = conn.execute(sql_read)
                    self.df_read = pd.DataFrame(rows, columns=['asin', 'id'])
                    self.df_read.drop_duplicates(['asin'], inplace=True)
                    if len(self.df_read) == 0:
                        return []
                    self.index_tuple = tuple(self.df_read['id'])
                    # A one-element tuple renders as "(x,)", which is not a
                    # valid SQL list, so that case is formatted by hand.
                    if len(self.index_tuple) == 1:
                        id_clause = f"({self.index_tuple[0]})"
                    else:
                        id_clause = f"{self.index_tuple}"
                    sql_update = f"""UPDATE {self.db_syn} a set state=2 where a.id in {id_clause}"""
                    conn.execute(sql_update)
                    return self.df_read['asin'].tolist()
            except Exception as e:
                time.sleep(random.uniform(10, 20.5))
                self.engine = self.save_asin_detail.engine
                print("读取数据出bug并等待5s继续", e, f"\n{traceback.format_exc()}")

    def save_detail(self):
        while True:
            try:
                self.engine = self.save_asin_detail.engine
                df_asin_detail = pd.DataFrame(data=self.asin_detail_list,
                                              columns=['asin', 'title', 'describe', 'selling_point', 'site'])
                self.asin_list_update = list(df_asin_detail.asin)
                df_asin_detail.to_sql(f"mx_self_asin_detail", con=self.engine,
                                      if_exists='append',
                                      index=False)
                break
            except Exception as e:
                time.sleep(random.uniform(10, 20.5))
                self.engine = self.save_asin_detail.engine
                print("存储数据出bug并等待5s继续", e, f"\n{traceback.format_exc()}")
                continue
        if self.asin_list_update:
            self.db_change_state(state=3, asin_list=self.asin_list_update)
            self.asin_list_update = []
        if self.asin_not_found_list:
            self.db_change_state(state=4, asin_list=self.asin_not_found_list)
        if self.asin_not_sure_list:
            self.db_change_state(state=6, asin_list=self.asin_not_sure_list)
        if self.asin_not_foot_list:
            self.db_change_state(state=7, asin_list=self.asin_not_foot_list)  # 没有脚
        if self.asin_not_foot2_list:
            self.db_change_state(state=8, asin_list=self.asin_not_foot2_list)
        if self.asin_not_buyBox_list:
            self.db_change_state(state=9, asin_list=self.asin_not_buyBox_list)
        if self.asin_not_response_list:
            self.db_change_state(state=10, asin_list=self.asin_not_response_list)
        if self.asin_not_redirect_list:
            self.db_change_state(state=12, asin_list=self.asin_not_redirect_list)
        if self.asin_not_div_id_dp_list:
            self.db_change_state(state=13, asin_list=self.asin_not_div_id_dp_list)
        if self.requests_error_asin_list:
            if self.site_name == 'us':
                self.db_change_state(state=1, asin_list=self.requests_error_asin_list)
            else:
                self.db_change_state(state=1, asin_list=self.requests_error_asin_list)
    def db_change_state(self, state=2, asin_list=None):
        """Write ``state`` for ``asin_list`` via ``db_change_state_common``.

        Side effect: afterwards ``asin_not_buyBox_list`` and
        ``asin_not_foot_list`` are both rebound to new empty lists, whatever
        ``state`` was written.
        NOTE(review): this reset happens on every call — confirm it is wanted
        for states other than 7 and 9.
        """
        self.db_change_state_common(state=state, asin_list=asin_list)
        self.asin_not_buyBox_list, self.asin_not_foot_list = [], []

    def db_change_state_common(self, state=None, asin_list=None):
        print(f"==================== 存储状态 {state} 数据 ========== {len(asin_list)} ========")
        df = self.df_read.loc[self.df_read.asin.isin(asin_list)]
        if state == 3:
            # 剔除状态 7，9 的id
            df = self.df_read.loc[
                (self.df_read.asin.isin(asin_list)) & ~(self.df_read.asin.isin(self.asin_not_foot_list)) & ~(
                    self.df_read.asin.isin(self.asin_not_buyBox_list))]
        id_tuple = tuple(df.id)
        while True:
            try:
                self.engine = self.save_asin_detail.engine
                with self.engine.begin() as conn:
                    # 1,3：1--回滚；3--成功
                    if id_tuple:
                        if len(id_tuple) == 1:
                            sql_update = f"update {self.db_syn} set state={state} where id in ({id_tuple[0]}) and state=2;"
                        else:
                            sql_update = f"update {self.db_syn} set state={state} where id in {id_tuple} and state=2;"
                        conn.execute(sql_update)
                break
            except Exception as e:
                time.sleep(random.uniform(10, 20.5))
                print(f"更改{self.db_syn}表的state={state}出错", e, f"\n{traceback.format_exc()}")
                self.engine = self.save_asin_detail.engine
                time.sleep(15)
                continue


if __name__ == '__main__':
    # Crawl one batch for the Belgian site, then one for the Dutch site,
    # pausing 3 seconds after each.
    for site_code in ('be', 'nl'):
        crawler = async_asin_pg(site_name=site_code)
        crawler.run()
        time.sleep(3)
