import sys
import os

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from utils.db_connect import BaseUtils
from amazon_params import py_ja3
from utils.requests_param import Requests_param_val
from utils.parse_search_term_xpath import ParseSearchTermUs
from amazon_params.params import DB_SEARCH_TERM_PARAMS_SPIDER
from lxml import etree
from queue import Queue
import time
import random
import re
import json
from curl_cffi import requests, Curl
import pandas as pd
from threading import Lock
import threading
import urllib3
import traceback
import gzip
import requests as requests2
from datetime import datetime

# from amazon_spider.inset_starrcoks_data import send_request

# Module-level requests session; get_search_kw mounts an adapter on it and uses it
# for page requests from every worker thread.
sess = requests2.Session()
# Page requests are sent with verify=False, so silence urllib3's InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class search_temp_pg(BaseUtils):
    def __init__(self, site_name='us', read_size=300, proxy_name=None, week=None, month=None):
        """Initialise crawler state, request helpers, DB/Kafka handles and Kafka topic names.

        :param site_name: site code; used for request parameters, table names and topic names.
        :param read_size: maximum number of task rows read per batch.
        :param proxy_name: passed through to ``Requests_param_val``.
        :param week: accepted by the signature but not read in this constructor.
        :param month: month value used in SQL filters and in the topic names.
        """
        super().__init__()
        self.site_name = site_name
        self.month = month
        print(site_name, self.month, 'proxy_name-proxy_name:', proxy_name)
        self.reuests_para_val = Requests_param_val(site_name=self.site_name, spider="search_term",
                                                   proxy_name=proxy_name)
        self.read_size = read_size

        # ---- queues ----
        self.cookies_queue = Queue()  # cookies handed out to requests
        self.search_term_queue = Queue()  # tasks waiting to be crawled
        self.search_term_priority_queue = Queue()  # priority crawl queue
        self.search_term_priority_list = []  # ids that must be crawled again
        self.pppoe_ip_queue = Queue()

        # ---- per-batch bookkeeping ----
        self.search_term_list = []  # search terms
        self.search_term_not_found = []  # ids that hit the "page not found" page
        self.asin_not_sure_list = []  # ids whose search returned no related results
        # Site home URL and host for the configured site
        self.site_url, self.host = self.reuests_para_val.get_site_url(self.site_name)
        self.search_term_not_next_page_list = []  # URLs for which no next page was found
        self.stop_item_queue = True  # loop flag; db_read_data_common sets it to False when no task is left
        self.search_term_url_list = []  # search_term + url entries
        self.search_term_list_update = []  # search terms stored successfully

        # ---- database / Kafka ----
        self.week_list = []
        self.db_name_change_common()  # opens the connections and derives the table names
        self.df_read = pd.DataFrame()
        self.id_tuple = ()
        self.nums_success = 0  # number of keywords that succeeded in the current loop

        # ---- parsed rows, one list per result type (only zr and sp carry page and page_row) ----
        self.zr_all_list = []
        self.sp_all_list = []
        self.sb_all_list = []
        self.ac_all_list = []
        self.bs_all_list = []
        self.er_all_list = []
        self.tr_all_list = []
        self.buy_text_list = []
        self.hr_list = []
        self.sort_all_list = []
        self.columns = ['search_term', 'asin', 'page', 'page_row', 'data_type',
                        'title', 'img_url', 'price', 'rating', 'total_comments']
        self.df_asin_detail_simply_list = []
        self.not_sp_url_kw_list = []
        self.nums_no_sp = 0
        self.st_list = []
        self.delete_cookies_list = []  # cookie keys whose pages showed a Chinese postcode
        self.headers_num_int = 0
        self.headers_num_int_s = 0
        self.search_term_html_queue = Queue()
        # Queue used to pass data to the database insert function
        self.insert_queue = Queue()
        self.keyword_html_data_list = []

        # ---- Kafka topics ----
        # Topic for the raw HTML samples
        self.search_term_html_topic = f'search_term_html_2025_{self.month}'
        # Topic for the detail data
        self.search_term_month_topic = f'{self.site_name}_search_term_month_2025_{self.month}'
        print('存储html 主题:', self.search_term_html_topic)
        print('存储数据详情 主题:', self.search_term_month_topic)

    def db_name_change_common(self):
        """Open the PostgreSQL and Kafka handles and derive the site-specific table names."""
        self.engine_pg = self.pg_connect()
        self.kafuka_producer = self.kafuka_connect(acks=True)

        def site_table(param_key, suffix='_month'):
            # Configured name minus its first two characters, prefixed with the
            # site code and followed by the suffix.
            return self.site_name + DB_SEARCH_TERM_PARAMS_SPIDER[param_key][2:] + suffix

        self.db_search_term = site_table("db_search_term", '_month_syn')
        self.db_search_term_zr = site_table("db_search_term_zr")
        self.db_search_term_sp = site_table("db_search_term_sp")
        self.db_search_term_sb = site_table("db_search_term_sb")
        self.db_search_term_ac = site_table("db_search_term_ac")
        self.db_search_term_bs = site_table("db_search_term_bs")
        self.db_search_term_er = site_table("db_search_term_er")
        self.db_search_term_tr = site_table("db_search_term_tr")
        self.db_search_term_hr = site_table("db_search_term_hr")
        self.db_other_search_term = site_table("db_other_search_term")
        self.db_brand_analytics = site_table("us_brand_analytics")

    def get_search_kw(self, t_num):
        while True:
            if self.search_term_queue.empty() == False:
                keywords_scraper_url = self.search_term_queue.get()
                keywords_scraper_url_list = keywords_scraper_url.split('|-|')
                keywords_id = int(keywords_scraper_url_list[0])
                keyword = keywords_scraper_url_list[1]
                scraper_url = keywords_scraper_url_list[2]
                if scraper_url not in self.search_term_not_next_page_list:
                    page = int(re.findall("&page=(\d+)", scraper_url)[0])
                    if self.cookies_queue.empty():
                        cookies_dict = self.reuests_para_val.get_cookie()
                        self.cookie_dict_delete_id = cookies_dict
                        for ck in cookies_dict.values():
                            self.cookies_queue.put(ck)
                    # 获取组装cookie
                    cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue)
                    headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url,
                                                                            asin=None, scraper_url=scraper_url)
                    headers["cookie"] = cookie_str
                    try:
                        if self.headers_num_int > 120:
                            resp = requests.get(scraper_url, impersonate="chrome110", headers=headers,
                                                timeout=10, verify=False)
                        else:
                            sess.mount(self.site_url, py_ja3.DESAdapter())
                            resp = sess.get(scraper_url, headers=headers,
                                            timeout=10, verify=False)
                        if self.reuests_para_val.check_amazon_yzm(resp):
                            self.search_term_priority_list.append(keywords_id)
                            self.headers_num_int += 1
                            continue
                    except Exception as e:
                        print('请求报错：', e)
                        self.search_term_priority_list.append(keywords_id)
                        continue
                    response = resp.text
                    etree_html = etree.HTML(response)
                    try:
                        ingress = etree_html.xpath("//span[@id='glow-ingress-line2']/text()")
                        print("**************  邮编  ***************", ingress)
                        ingress = ingress[0].strip()
                    except:
                        ingress = None
                        print("获取邮编錯誤:")
                    print('***** 邮编: ', ingress, "关键词：", keyword, '  请求url: ', scraper_url)
                    if ingress:
                        if self.reuests_para_val.check_amazon_ingress(ingress):
                            self.search_term_priority_list.append(keywords_id)
                            if self.site_name != 'es' or self.site_name != 'it':
                                try:
                                    cookie_ubid_main_id = re.findall(r'ubid-main=(.*?);', cookie_str)[0]
                                except:
                                    cookie_ubid_main_id = re.findall(r'session-id=(.*?);', cookie_str)[0]
                                for cookie_key_value in self.cookie_dict_delete_id.items():
                                    if cookie_ubid_main_id in cookie_key_value[1]:
                                        self.delete_cookies_list.append(cookie_key_value[0])
                            continue
                    else:
                        print("没有获取到邮编，", keyword, scraper_url)
                        self.search_term_priority_list.append(keywords_id)
                        self.headers_num_int += 1
                        continue
                    if (self.reuests_para_val.check_amazon_not_page(response)) and (
                            '404' not in keyword and 'page not found' not in keyword):
                        print("***** Page Not Found  关键词搜索出现问题：", keyword, scraper_url)
                        self.search_term_priority_list.append(keywords_id)
                        # self.search_term_not_found.extend([keywords_id])
                        time.sleep(random.uniform(2.2, 4.5))
                        continue
                    # 判断是否有下一页
                    lock = Lock()
                    lock.acquire()
                    no_results = 'No results for'
                    if no_results in response:
                        self.asin_not_sure_list.extend([keywords_id])
                        lock.release()
                    num = random.randint(1, 100)
                    if num < 10:
                        new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                        response_gzip = self.compress_string(response)
                        md5_hex_digest = self.reuests_para_val.hex_md5(keyword)
                        html_data = f'{md5_hex_digest}|-||=|-|=||-|{self.site_name}|-||=|-|=||-|{keyword}|-||=|-|=||-|{response_gzip}|-||=|-|=||-|{new_date}|-||=|-|=||-|{page}'
                        self.send_kafka_html(html_data=html_data)
                    self.parse_html_page(response, keyword, scraper_url, page)
            else:
                print(f"当前线程-{t_num} 已完成-爬取-跳出循环")
                break

    # Compress a string
    def compress_string(self, input_string):
        return gzip.compress(input_string.encode())

    # Decompress a string
    def decompress_string(self, input_bytes):
        # 在Python中，字节对象和它们的字符串表示形式是不同的。例如，字节对象 b'hello' 和字符串 'b'hello'' 是不同的，尽管它们看起来很相似。
        # 当你从数据库中读取压缩的数据时，如果数据被存储为二进制格式，那么你需要以二进制形式读取它，然后解压缩。但是，如果数据被存储为字符串（
        # 可能是因为数据库的限制或其他原因），那么你需要先将字符串转换为字节对象，然后再解压缩。
        import ast
        if isinstance(input_bytes, str):
            # html_str = json.loads(html_str)
            input_bytes = ast.literal_eval(input_bytes)
        return gzip.decompress(input_bytes).decode('utf-8')

    def init_list(self):
        print("=======清空变量==========")
        self.item_queue = Queue()  # 存储 item 详情数据队列
        self.search_term_queue = Queue()  # 需要爬取的asin队列
        self.buyBox_list = []  # 卖家名称 url 列表
        self.delete_cookies_list = []  # 存储出现中国邮编的cookie
        self.search_term_not_found = []  # 没有关键词相关结果的列表
        self.asin_not_sure_list = []  # 没有关键词相关结果的列表
        self.search_term_not_next_page_list = []  # 存放找不到下一页关键词的url队列
        self.search_term_url_list = []  # 存储要爬取的url
        self.zr_all_list = []
        self.sp_all_list = []
        self.sb_all_list = []
        self.ac_all_list = []
        self.bs_all_list = []
        self.er_all_list = []
        self.tr_all_list = []
        self.buy_text_list = []
        self.hr_list = []
        self.sort_all_list = []
        self.df_asin_detail_simply_list = []
        self.cookies_list = []
        self.search_term_list = []  # 存放search_term的列表
        self.headers_num_int = 0
        self.search_term_html_queue = Queue()
        # 创建一个队列用于将数据传递给数据库插入函数
        self.insert_queue = Queue()
        self.keyword_html_data_list = []
        self.kafuka_producer.close()

    def run_pol(self):
        """Process one batch: read tasks, crawl them with 16 threads, store results, reset state.

        Returns immediately when ``db_read_data_common`` yields no task.
        """
        batch = self.db_read_data_common()
        if not batch:
            return

        # Make sure the workers start with cookies available
        if self.cookies_queue.empty():
            fresh_cookies = self.reuests_para_val.get_cookie()
            self.cookie_dict_delete_id = fresh_cookies
            for cookie in fresh_cookies.values():
                self.cookies_queue.put(cookie)

        for task in batch:
            self.search_term_queue.put(task)

        workers = [threading.Thread(target=self.get_search_kw, args=(idx,)) for idx in range(16)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        print('最后刷新kafka flush')
        self.kafuka_producer.flush()
        print('当前线程抓取结束')

        # Persist the parsed rows, then write the task states back
        print("存储数据")
        self.db_save_data()
        self.db_change_state()

        # Drop the cookies collected for deletion during this batch
        print("删除cookie：", len(self.delete_cookies_list))
        self.reuests_para_val.delete_china_cookie(list(set(self.delete_cookies_list)))

        # Clear the per-batch variables
        self.init_list()

    def parse_html_page(self, response=None, keywords=None, scraper_url=None, page=None):
        """Parse one search-result page and append its rows to the per-type result lists.

        :param response: HTML text of the page.
        :param keywords: search term the page belongs to.
        :param scraper_url: URL the page was fetched from (not used by the parsing itself).
        :param page: page number passed on to the parser.
        """
        parse_search_term = ParseSearchTermUs(page_source=response, driver=None, search_term=keywords,
                                              page=page, site_name=self.site_name)
        (zr_list, sp_list, sb_list, ac_list, bs_list, er_list, tr_list,
         sort_list, buy_text_list, hr_list) = parse_search_term.run()

        # Merge straight from the local results. This method runs in several
        # worker threads at once; staging results in a shared list that is
        # iterated and then reset lets one thread drop or duplicate another
        # thread's rows, and applied this page's `page == 1` test to their data.
        #
        # The former "no sp results" re-queue is intentionally gone: its counter
        # was reset on every call, so the threshold could not be reached in
        # normal operation, and it queued (keyword, url) tuples that
        # get_search_kw cannot split.
        self.zr_all_list.extend(zr_list)
        self.sp_all_list.extend(sp_list)
        self.sb_all_list.extend(sb_list)
        self.ac_all_list.extend(ac_list)
        self.bs_all_list.extend(bs_list)
        self.er_all_list.extend(er_list)
        self.tr_all_list.extend(tr_list)
        self.buy_text_list.extend(buy_text_list)
        self.hr_list.extend(hr_list)
        # sort_list rows are only collected from the first page
        if parse_search_term.page == 1:
            self.sort_all_list.extend(sort_list)

    def db_read_data_common(self):
        """Read the next batch of pending tasks and mark them as in progress.

        Selects up to ``read_size`` rows with ``state=1`` for the configured month,
        sets them to ``state=2`` and stores the frame in ``self.df_read``. Any
        exception is printed and the whole read is retried after a random pause,
        with ``read_size`` lowered to 100.

        :return: list of ``'<id>|-|<search_term>|-|<url>'`` strings, or ``[]`` when
            no pending row is left (``stop_item_queue`` is then set to ``False``).
        """
        while True:
            try:
                self.engine_pg = self.pg_connect()
                sql_read = f"""SELECT id, search_term, url FROM {self.db_search_term} where state=1 and month={self.month} LIMIT {self.read_size} for update;"""
                print(sql_read)
                # NOTE(review): the read uses `self.engine` while everything else
                # uses `self.engine_pg`; `self.engine` is not assigned in this
                # file -- confirm it is provided by BaseUtils.
                # NOTE(review): the UPDATE below runs in its own begin() block, so
                # the row locks taken by FOR UPDATE may already be released by
                # then -- confirm how read_sql handles transactions.
                self.df_read = self.engine.read_sql(sql_read)
                if self.df_read.shape[0] == 0:
                    self.stop_item_queue = False
                    return []

                self.id_tuple = tuple(self.df_read.id)
                self.date_info = f'2025-{self.month}'
                print('date_info::', self.date_info, ' 月：', self.month)
                # Render the ids explicitly. Interpolating the tuple itself uses
                # repr() of each element, which for numpy integers is
                # `np.int64(…)` on NumPy >= 2 and is not valid SQL.
                id_sql = ', '.join(str(int(row_id)) for row_id in self.id_tuple)
                sql_update = f'UPDATE {self.db_search_term} set state=2 where id in ({id_sql});'
                with self.engine_pg.begin() as conn:
                    conn.execute(sql_update)
                search_term_list = list(
                    self.df_read.id.astype("U") + '|-|' + self.df_read.search_term + '|-|' + self.df_read.url)
                return search_term_list
            except Exception as e:
                time.sleep(random.uniform(5, 17.5))
                self.read_size = 100
                print("读取数据出bug并等待5s继续", e)
                continue

    def db_change_state(self):
        """Write the outcome of the finished batch back to the task table.

        State 3 is applied first (to every row not listed for retry), then the
        not-found, no-result and retry lists are applied as states 4, 6 and 1,
        each only when it is non-empty.
        """
        self.db_change_state_common(3, [])
        follow_ups = (
            (4, self.search_term_not_found),
            (6, self.asin_not_sure_list),
            (1, self.search_term_priority_list),
        )
        for state, keyword_ids in follow_ups:
            if keyword_ids:
                self.db_change_state_common(state, list(set(keyword_ids)))

    def db_change_state_common(self, state, search_term_list):
        if state == 3:
            df = self.df_read.loc[~self.df_read.id.isin(self.search_term_priority_list)] # 找出id不再search_term_list里面的修改状态 3
            id_tuple = tuple(df.id)
        else:
            df = self.df_read.loc[self.df_read.id.isin(search_term_list)] # 找出id再search_term_list里面的修改状态 1
            id_tuple = tuple(df.id)
        print(f"== 存储状态 {state} 数据 ========== {len(id_tuple)} ========")
        while True:
            try:
                self.engine_pg = self.pg_connect()
                with self.engine_pg.begin() as conn:
                    if id_tuple:
                        if len(id_tuple) == 1:
                            sql_update = f"update {self.db_search_term} set state={state} where id in ({id_tuple[0]});"
                        else:
                            sql_update = f"update {self.db_search_term} set state={state} where id in {id_tuple};"
                        conn.execute(sql_update)
                break
            except Exception as e:
                print(f"更改{self.db_search_term}表的state={state}出错", e, f"\n{traceback.format_exc()}")
                time.sleep(15)
                continue

    def db_save_common(self, cate_type=None, data_list=None, db_name=None):
        if data_list:
            print(db_name, "存储详细数据：", cate_type, len(data_list))
            """
            仅仅当传过来的data_list不为空的情况下，才会执行下面的数据
            """
            # # if self.site_name != 'us':
            # new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # if len(data_list) > 1000:
            #     if cate_type in ['zr']:
            #         # 计算分割点
            #         split_point = len(data_list) // 20
            #     else:
            #         # 计算分割点
            #         split_point = len(data_list) // 10
            #     # 使用切片分割列表
            #     split_data = [data_list[i:i + split_point] for i in range(0, len(data_list), split_point)]
            #     # 打印结果
            #     for i, sublist in enumerate(split_data, 1):
            #         print(f'{cate_type}  sublist:', len(sublist))
            #         item = {"cate_type": cate_type, "data_list": sublist, 'date_info': self.date_info,
            #                 'spider_time': new_date}
            #         self.send_kafka(items=item)
            # else:
            #     item = {"cate_type": cate_type, "data_list": data_list, 'date_info': self.date_info,
            #             'spider_time': new_date}
            #     self.send_kafka(items=item)

            # else:
            #########  以下注释是正常us抓取入库。###########################
            if cate_type in ['buy']:
                df = pd.DataFrame(data=data_list, columns=['search_term', 'asin', 'page', 'buy_data', 'label','asin_brand'])
                df.label = df.label.apply(lambda x: str(x)[:200] if x is not None else None)  # 截取字符
                df.buy_data = df.buy_data.apply(lambda x: str(x)[:200] if x is not None else None)  # 截取字符
            else:
                df = pd.DataFrame(data=data_list, columns=self.columns)
            df['date_info'] = self.date_info
            if cate_type in ['zr', 'sp']:
                df = df.loc[:, ['search_term', 'asin', 'page', 'page_row', 'date_info']]
                df.drop_duplicates(['search_term', 'asin', 'page', 'page_row'], inplace=True)
            elif cate_type in ['buy']:
                df = df.loc[:, ['search_term', 'asin', 'page', 'buy_data', 'date_info', 'label']]
                df.drop_duplicates(['search_term', 'asin', 'page', 'buy_data', 'label'], inplace=True)
            else:
                if cate_type in ['sb', 'tr']:
                    df = df.loc[:, ['search_term', 'asin', 'page', 'data_type', 'date_info']]
                    df.drop_duplicates(['search_term', 'asin', 'page', 'data_type'], inplace=True)
                elif cate_type in ['buy']:
                    df = df.loc[:, ['search_term', 'asin', 'page', 'buy_data', 'date_info', 'label','asin_brand']]
                    df.drop_duplicates(['search_term', 'asin', 'page', 'buy_data', 'label'], inplace=True)
                    df.label = df.label.apply(lambda x: str(x)[:200] if x is not None else None)  # 截取字符
                    df.buy_data = df.buy_data.apply(lambda x: str(x)[:200] if x is not None else None)  # 截取字符
                    df.asin_brand = df.asin_brand.apply(lambda x: str(x)[:200] if x is not None else None)  # 截取字符
                else:
                    df = df.loc[:, ['search_term', 'asin', 'page', 'date_info']]
                    df.drop_duplicates(['search_term', 'asin', 'page'], inplace=True)
            while True:
                try:
                    self.engine_pg = self.pg_connect()
                    if df.shape[0] > 0:
                        print("db_name:", db_name)
                        df['asin'] = df['asin'].str.replace('/', '')
                        self.engine_pg.to_sql(df, db_name, if_exists="append")
                    break
                except Exception as e:
                    print(e, f"\n{traceback.format_exc()}")
                    time.sleep(5)
                    continue

    def db_update_brand(self):
        if self.sort_all_list:
            while True:
                try:
                    self.engine_pg = self.pg_connect()
                    df_being_sold = pd.DataFrame(data=self.sort_all_list,
                                                 columns=['search_term', 'quantity_being_sold',
                                                          'quantity_being_sold_str', 'result_count','departments'])
                    # 获取成功抓取的搜索词来更改状态 3
                    df_being_sold['month'] = self.month
                    df_being_sold['date_info'] = self.date_info
                    year_moth_list = self.date_info.split('-')
                    print('year_moth_list::,', year_moth_list)
                    print(f'存储表：：{self.db_brand_analytics}_{year_moth_list[0]}')
                    print(len(self.sort_all_list))
                    df_being_sold.drop_duplicates(['search_term', 'quantity_being_sold'], inplace=True)  # 去重
                    if df_being_sold.shape[0] > 0:
                        self.engine_pg.to_sql(df_being_sold,f'{self.db_brand_analytics}_{year_moth_list[0]}',
                                             if_exists='append')
                    break
                except Exception as e:
                    print('db_update_brand::', e, f"\n{traceback.format_exc()}")
                    time.sleep(5)
                    continue

    def on_send_success(self, record_metadata):
        # print(f"消息发送成功: {record_metadata.topic}-{record_metadata.partition}-{record_metadata.offset}")
        pass

    def on_send_error(self, excp):
        print("消息发送失败", excp)

    def send_kafka_html(self, html_data=None, items=None):
        for i in range(10):
            try:
                if html_data:
                    future = self.kafuka_producer.send(self.search_term_html_topic, html_data)
                    future.add_callback(self.on_send_success).add_errback(self.on_send_error)
                    print(f"{i}发送中")
                    future.get(8)  # 阻塞直到发送成功或超时
                    print(f"{i}发送成功")
                    self.kafuka_producer.flush()
                break
            except Exception as e:
                if i > 0 and i % 2 == 0:
                    self.kafuka_producer = self.kafuka_connect(acks=True)
                elif i >= 9:
                    print("搜索词上传html报错", e, f"\n{traceback.format_exc()}")

    def db_save_data(self):
        print('=========================  准备存储   ============================')
        if self.zr_all_list or self.sp_all_list:
            year_moth = self.date_info.replace('-', '_')
            self.db_save_common(cate_type='zr', data_list=self.zr_all_list,
                                db_name=f"{self.db_search_term_zr}_{year_moth}")
            self.db_save_common(cate_type='sp', data_list=self.sp_all_list,
                                db_name=f"{self.db_search_term_sp}_{year_moth}")
            self.db_save_common(cate_type='sb', data_list=self.sb_all_list,
                                db_name=f"{self.db_search_term_sb}_{year_moth}")
            self.db_save_common(cate_type='ac', data_list=self.ac_all_list,
                                db_name=f"{self.db_search_term_ac}_{year_moth}")
            self.db_save_common(cate_type='bs', data_list=self.bs_all_list,
                                db_name=f"{self.db_search_term_bs}_{year_moth}")
            self.db_save_common(cate_type='er', data_list=self.er_all_list,
                                db_name=f"{self.db_search_term_er}_{year_moth}")
            self.db_save_common(cate_type='tr', data_list=self.tr_all_list,
                                db_name=f"{self.db_search_term_tr}_{year_moth}")
            self.db_save_common(cate_type='buy', data_list=self.buy_text_list,
                                db_name=f"{self.db_other_search_term}_{year_moth}")
            self.db_save_common(cate_type='hr', data_list=self.hr_list,
                                db_name=f"{self.db_search_term_hr}_{year_moth}")
            self.db_update_brand()
