import sys
import os
import numpy as np

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from utils.db_connect import BaseUtils
from utils.requests_param import Requests_param_val
from utils.parse_search_term_xpath import ParseSearchTermUs
from amazon_params.params import DB_SEARCH_TERM_PARAMS_SPIDER_MYSQL
from lxml import etree
from queue import Queue
import time
import random
import re
import json
import pandas as pd
from threading import Lock
import threading
import urllib3
import traceback
import uuid
import requests as requests2
sess = requests2.Session()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from curl_cffi import requests
from amazon_params import py_ja3
from datetime import datetime


class search_temp_mysql(BaseUtils):
    def __init__(self, site_name=None, read_size=300, proxy_name=None, week=None, month=None):
        """Set up connections, work queues and per-batch result buffers for one site.

        Args:
            site_name: Site code; passed to Requests_param_val and used as the
                prefix of every table name built in db_name_change_common.
            read_size: Maximum number of search-term rows claimed per batch.
            proxy_name: Proxy identifier forwarded to Requests_param_val.
            week: Week number used to filter rows and to build ``date_info``.
            month: Accepted but not used by this constructor.
        """
        super().__init__()
        self.site_name = site_name  # site code
        print('代理ip  proxy_name:', proxy_name)
        self.reuests_para_val = Requests_param_val(site_name=self.site_name, spider="search_term",
                                                   proxy_name=proxy_name)
        self.week = week
        self.engine_pg = self.pg_connect()
        self.read_size = read_size
        self.cookies_queue = Queue()  # cookie queue
        self.search_term_queue = Queue()  # work items waiting to be crawled ("id|-|search_term|-|url")
        self.search_term_priority_queue = Queue()  # priority crawl queue
        self.search_term_priority_list = []  # ids that must be crawled again (reset to state 1)
        self.requests_error_keyword_list = []  # keywords whose request failed
        self.search_term_list = []  # list of search terms
        self.search_term_not_found = []  # ids whose search returned no results
        # host and home-page URL for this site
        self.site_url, self.host = self.reuests_para_val.get_site_url(self.site_name)
        self.search_term_not_next_page_list = []  # URLs for which no next page was found
        self.stop_item_queue = True  # loop-exit flag; set to False when no rows are left to read
        self.thread_nums = 1
        # keyword bookkeeping
        self.search_term_url_list = []  # search_term + url pairs
        self.search_term_list_update = []  # search terms crawled successfully
        self.search_term_failed = []  # keywords whose worker thread failed
        # database-related state
        self.db_name_change_common()  # builds the per-site table names and connects the Kafka producer
        self.df_read = pd.DataFrame()
        self.id_tuple = ()
        self.nums_delete = 20
        self.nums_success = 0  # keywords crawled successfully in the current loop
        self.nums_fail = 0  # total number of failed crawls
        self.time_seconds = 0  # time spent on the current loop
        # page parsing results (only zr and sp carry page and page_row)
        self.zr_all_list = []
        self.sp_all_list = []
        self.sb_all_list = []
        self.ac_all_list = []
        self.bs_all_list = []
        self.er_all_list = []
        self.hr_list = []
        self.tr_all_list = []
        self.sort_all_list = []
        self.buy_text_list = []
        self.columns = ['search_term', 'asin', 'page', 'page_row', 'cate_type',
                        'title', 'img_url', 'price', 'rating', 'total_comments']
        self.df_asin_detail_simply_list = []
        self.not_sp_url_kw_list = []
        self.nums_no_sp = 0
        self.st_list = []
        self.delete_cookies_list = []  # cookies that returned a Chinese zip code
        self.headers_num_int = 0
        self.headers_num_int_s = 0

    def db_name_change_common(self):
        """Connect the Kafka producer and derive the table names for this site.

        Every name is the configured template from
        DB_SEARCH_TERM_PARAMS_SPIDER_MYSQL with its first two characters
        replaced by ``self.site_name``.
        """
        self.kafuka_producer = self.kafuka_connect()

        site = self.site_name
        templates = DB_SEARCH_TERM_PARAMS_SPIDER_MYSQL

        def site_table(key):
            # Drop the two-character prefix of the template and prepend the site code.
            return site + templates[key][2:]

        # The work table is the only one that carries the extra '_syn' suffix.
        self.db_search_term = site_table("db_search_term") + '_syn'
        self.db_search_term_zr = site_table("db_search_term_zr")
        self.db_search_term_sp = site_table("db_search_term_sp")
        self.db_search_term_sb = site_table("db_search_term_sb")
        self.db_search_term_ac = site_table("db_search_term_ac")
        self.db_search_term_bs = site_table("db_search_term_bs")
        self.db_search_term_er = site_table("db_search_term_er")
        self.db_search_term_tr = site_table("db_search_term_tr")
        self.db_search_term_hr = site_table("db_search_term_hr")
        self.db_other_search_term = site_table("db_other_search_term")
        self.db_brand_analytics = site_table("us_brand_analytics")

    def get_search_kw(self, t_num):
        """Worker loop: drain ``search_term_queue``, fetch each page and parse it.

        Each queue item is an ``"id|-|search_term|-|url"`` string. The item's id is
        appended to ``search_term_priority_list`` (to be crawled again) when the
        request raises, a captcha page is returned, the delivery zip code is
        missing, or the zip code is rejected by ``check_amazon_ingress``. Pages
        that pass these checks are handed to ``parse_html_page``; ids whose page
        contains "No results for" are also recorded in ``search_term_not_found``.

        Args:
            t_num: Worker index, used only in the log line printed on exit.
        """
        from queue import Empty  # local import: the module header only imports Queue

        while True:
            # get_nowait() tests and dequeues in one step. A separate empty()/get()
            # pair lets another worker take the last item in between, leaving this
            # thread blocked in get() forever.
            try:
                keywords_scraper_url = self.search_term_queue.get_nowait()
            except Empty:
                print(f"当前线程-{t_num} 已完成-爬取-跳出循环")
                break

            # The id is the first field and the URL the last, so a search term that
            # itself contains the separator is still parsed correctly.
            try:
                raw_id, remainder = keywords_scraper_url.split('|-|', 1)
                keyword, scraper_url = remainder.rsplit('|-|', 1)
                keywords_id = int(raw_id)
            except (AttributeError, ValueError) as e:
                # Skip a malformed item rather than letting it kill the worker.
                print('skip malformed queue item:', keywords_scraper_url, e)
                continue

            if scraper_url in self.search_term_not_next_page_list:
                continue

            print("keywords, scraper_url:", keywords_id, keyword, scraper_url)
            page_match = re.search(r"&page=(\d+)", scraper_url)
            # A URL without an explicit page parameter is treated as page 1.
            page = int(page_match.group(1)) if page_match else 1
            print("当前抓取week:", self.week)
            print(f"cookie实时数量： {self.cookies_queue.qsize()}")
            if self.cookies_queue.empty():
                cookies_dict = self.reuests_para_val.get_cookie()
                self.cookie_dict_delete_id = cookies_dict
                for ck in cookies_dict.values():
                    self.cookies_queue.put(ck)
            # Build the cookie header for this request.
            cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue)
            headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url,
                                                                    asin=None, scraper_url=scraper_url)
            headers["cookie"] = cookie_str
            print(f"{self.headers_num_int} 当前请求 {keyword} 第  {page} 页 ", scraper_url)
            try:
                # After more than 120 counted failures, switch from the JA3-adapter
                # session to curl_cffi's Chrome impersonation.
                if self.headers_num_int > 120:
                    resp = requests.get(scraper_url, impersonate="chrome120", headers=headers,
                                        timeout=10, verify=False)
                else:
                    sess.mount(self.site_url, py_ja3.DESAdapter())
                    resp = sess.get(scraper_url, headers=headers,
                                    timeout=10, verify=False)
                if self.reuests_para_val.check_amazon_yzm(resp):
                    print(f"{self.site_name}  站点  +   使用代理ip出现验证码：{scraper_url}")
                    self.search_term_priority_list.append(keywords_id)
                    time.sleep(random.uniform(3.2, 4.5))
                    self.headers_num_int += 1
                    continue
            except Exception as e:
                print('请求报错：', e)
                self.search_term_priority_list.append(keywords_id)
                time.sleep(random.uniform(3.2, 4.5))
                continue

            response = resp.text
            # Parsing sits inside the guarded block so an unparsable body is handled
            # like a page without a zip code (retry) instead of ending the thread.
            try:
                ingress = etree.HTML(response).xpath("//span[@id='glow-ingress-line2']/text()")
                print("**************  邮编  ***************", ingress)
                ingress = ingress[0].strip()
            except Exception:
                ingress = None
                print("获取邮编錯誤:")
            print('*** 邮编: ', ingress, "关键词：", keyword, '  请求url: ', scraper_url)

            if not ingress:
                print("没有获取到邮编，", keyword, scraper_url)
                self.search_term_priority_list.append(keywords_id)
                self.headers_num_int += 1
                continue

            if self.reuests_para_val.check_amazon_ingress(ingress):
                print("************ 邮编出现问题：************")
                self.search_term_priority_list.append(keywords_id)
                # Identify the cookie by its ubid-main value, falling back to session-id.
                id_match = (re.search(r'ubid-main=(.*?);', cookie_str)
                            or re.search(r'session-id=(.*?);', cookie_str))
                if id_match:
                    cookie_marker = id_match.group(1)
                    for cookie_id, cookie_value in self.cookie_dict_delete_id.items():
                        if cookie_marker in cookie_value:
                            self.delete_cookies_list.append(cookie_id)
                self.headers_num_int += 1
                continue

            # NOTE(review): `ingress` is always truthy here (the falsy case continued
            # above), so this branch can never run. Kept unchanged pending a decision
            # on whether "page not found" should be checked regardless of the zip code.
            if self.reuests_para_val.check_amazon_not_page(response) and ingress is None:
                print("*** Page Not Found  关键词搜索出现问题：", keyword, scraper_url)
                self.search_term_priority_list.append(keywords_id)
                time.sleep(random.uniform(1.2, 4.5))
                continue

            # No lock is taken here: a Lock created inside this call is private to
            # the calling thread and guards nothing.
            if 'No results for' in response:
                self.search_term_not_found.append(keywords_id)
            self.parse_html_page(response, keyword, scraper_url, page)

    def init_list(self):
        print("=======清空变量==========")
        self.item_queue = Queue()  # 存储 item 详情数据队列
        self.search_term_queue = Queue()  # 需要爬取的asin队列
        self.buyBox_list = []  # 卖家名称 url 列表
        self.asin_detail_list = []  # 存储asin 详情的列表
        self.buyBoxname_asin_list = []  # asin 卖家的列表
        self.delete_cookies_list = []  # 存储出现中国邮编的cookie
        self.requests_error_keyword_list = []  # 请求失败的关键词
        self.search_term_not_found = []  # 没有关键词相关结果的列表
        self.search_term_not_next_page_list = []  # 存放找不到下一页关键词的url队列
        self.search_term_url_list = []  # 存储要爬取的url
        self.zr_all_list = []
        self.sp_all_list = []
        self.sb_all_list = []
        self.ac_all_list = []
        self.bs_all_list = []
        self.er_all_list = []
        self.tr_all_list = []
        self.hr_list = []
        self.buy_text_list = []
        self.sort_all_list = []
        self.df_asin_detail_simply_list = []
        self.cookies_list = []
        self.search_term_list = []  # 存放search_term的列表
        self.headers_num_int = 0
        self.headers_num_int_s = 0
        self.not_sp_url_kw_list = []

    def run_pol(self):
        """Run one batch: claim rows, crawl them with 17 workers, store and reset.

        Does nothing when no rows are available.
        """
        pending_items = self.db_read_data_common()
        if not pending_items:
            return

        # Top up the cookie pool before the workers start.
        if self.cookies_queue.empty():
            cookies_dict = self.reuests_para_val.get_cookie()
            self.cookie_dict_delete_id = cookies_dict
            for cookie in cookies_dict.values():
                self.cookies_queue.put(cookie)

        for work_item in pending_items:
            self.search_term_queue.put(work_item)

        workers = [threading.Thread(target=self.get_search_kw, args=(index,))
                   for index in range(17)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        print("存储数据")
        self.db_save_data()
        self.db_change_state()

        # Remove the cookies flagged during this batch (de-duplicated).
        flagged_cookies = list(set(self.delete_cookies_list))
        print("删除cookie：", len(flagged_cookies))
        self.reuests_para_val.delete_china_cookie(flagged_cookies)

        # Clear the per-batch state.
        self.init_list()


    def parse_html_page(self, response=None, keywords=None, scraper_url=None, page=None):
        """Parse one search-result page and merge its rows into the batch buffers.

        This method is called concurrently by the worker threads started in
        ``run_pol``. The parsed lists are therefore merged straight from local
        variables: staging them on shared instance state and then iterating and
        resetting that state lets two threads merge the same page twice or
        discard each other's page.

        Args:
            response: HTML source of the page.
            keywords: Search term the page belongs to.
            scraper_url: URL the page was fetched from (not used for parsing).
            page: Page number passed on to the parser.
        """
        parse_search_term = ParseSearchTermUs(page_source=response, driver=None, search_term=keywords,
                                              page=page, site_name=self.site_name)
        (zr_list, sp_list, sb_list, ac_list, bs_list, er_list, tr_list,
         sort_list, buy_text_list, hr_list) = parse_search_term.run()

        # NOTE: pages without sponsored results are not re-queued. The queue
        # consumer expects "id|-|search_term|-|url" strings, and this method has
        # no keyword id to build one from.

        # Each extend() is a single list operation, so concurrent callers cannot
        # interleave inside one page's rows under CPython's GIL.
        self.zr_all_list.extend(zr_list)
        self.sp_all_list.extend(sp_list)
        self.sb_all_list.extend(sb_list)
        self.ac_all_list.extend(ac_list)
        self.bs_all_list.extend(bs_list)
        self.er_all_list.extend(er_list)
        self.tr_all_list.extend(tr_list)
        self.buy_text_list.extend(buy_text_list)
        self.hr_list.extend(hr_list)
        # The sort list is only collected from the first page of a search term.
        if parse_search_term.page == 1:
            self.sort_all_list.extend(sort_list)

    def db_read_data_common(self):
        """Claim the next batch of pending search terms and mark them in progress.

        Selects up to ``read_size`` rows with ``state=1`` for ``self.week`` inside
        one transaction, stores them in ``self.df_read``, sets ``self.date_info``
        and flips the claimed rows to ``state=2``. Retries indefinitely on any
        error, lowering ``read_size`` to 100 after the first failure.

        Returns:
            A list of ``"id|-|search_term|-|url"`` strings, or ``[]`` when no rows
            are pending (``stop_item_queue`` is then set to False).
        """
        while True:
            try:
                self.engine_pg = self.pg_connect()
                with self.engine_pg.begin() as conn:
                    sql_read = f'SELECT id, search_term, url FROM {self.db_search_term} where state=1 and week={int(self.week)} ORDER BY id LIMIT {self.read_size} for update;'
                    print(sql_read)
                    rows = conn.execute(sql_read)
                    self.df_read = pd.DataFrame(rows, columns=['id', 'search_term', 'url'])

                    # Single-digit weeks are zero-padded; others are used as given.
                    week_number = int(self.week)
                    _week = f'0{week_number}' if week_number < 10 else self.week
                    # NOTE(review): the year is hard-coded — confirm how it should
                    # be derived for runs outside 2024.
                    self.date_info = f'2024-{_week}'
                    print('date_info::', self.date_info, ' 周：', self.week)

                    if self.df_read.shape[0] == 0:
                        self.stop_item_queue = False
                        return []

                    self.id_tuple = tuple(self.df_read.id)
                    # A one-element tuple renders as "(x,)", which is not valid SQL.
                    if len(self.id_tuple) == 1:
                        id_clause = f'({self.id_tuple[0]})'
                    else:
                        id_clause = f'{self.id_tuple}'
                    sql_update = f'UPDATE {self.db_search_term} set state=2 where id in {id_clause};'
                    conn.execute(sql_update)

                    id_text = self.df_read.id.astype("U")
                    work_items = id_text + '|-|' + self.df_read.search_term + '|-|' + self.df_read.url
                    return list(work_items)
            except Exception as e:
                time.sleep(random.uniform(13.5, 37.5))
                self.read_size = 100
                print("读取数据出bug并等待5s继续", e)

    def db_change_state(self):
        self.db_change_state_common(3, [])
        if self.search_term_not_found:
            self.db_change_state_common(4, list(set(self.search_term_not_found)))
        if self.search_term_priority_list:
            self.db_change_state_common(1, list(set(self.search_term_priority_list)))

    def db_change_state_common(self, state, search_term_list):
        """Set ``state`` on a group of rows in the work table, retrying until it succeeds.

        Args:
            state: New state value. For 3, every id claimed in the current batch
                (``self.id_tuple``) is updated and ``search_term_list`` is ignored.
            search_term_list: Ids to update for any other state; only ids present
                in ``self.df_read`` are used.
        """
        if state == 3:
            id_tuple = self.id_tuple
        else:
            in_batch = self.df_read.id.isin(search_term_list)
            id_tuple = tuple(self.df_read.loc[in_batch].id)
        print(f"== 存储状态 {state} 数据 ========== {len(id_tuple)} ========")

        done = False
        while not done:
            try:
                self.engine_pg = self.pg_connect()
                with self.engine_pg.begin() as conn:
                    if id_tuple:
                        # A one-element tuple renders as "(x,)", which is not valid SQL.
                        if len(id_tuple) == 1:
                            id_clause = f"({id_tuple[0]})"
                        else:
                            id_clause = f"{id_tuple}"
                        sql_update = f"update {self.db_search_term} set state={state} where id in {id_clause};"
                        conn.execute(sql_update)
                done = True
            except Exception as e:
                print(f"更改{self.db_search_term}表的state={state}出错", e, f"\n{traceback.format_exc()}")
                time.sleep(15)

    def db_save_common(self, cate_type=None, data_list=None, db_name=None):
        if data_list:
            print("存储详细数据：", cate_type, len(data_list))
            """
            仅仅当传过来的data_list不为空的情况下，才会执行下面的数据
            """

            new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if len(data_list) > 1000:
                if cate_type in ['zr']:
                    # 计算分割点
                    split_point = len(data_list) // 20
                else:
                    # 计算分割点
                    split_point = len(data_list) // 10
                # 使用切片分割列表
                split_data = [data_list[i:i + split_point] for i in range(0, len(data_list), split_point)]
                # 打印结果
                for i, sublist in enumerate(split_data, 1):
                    print(f'{cate_type}  sublist:', len(sublist))
                    item = {"cate_type": cate_type, "data_list": sublist, 'date_info': self.date_info,
                            'spider_time': new_date}
                    self.send_kafka(item)
            else:
                item = {"cate_type": cate_type, "data_list": data_list, 'date_info': self.date_info,
                        'spider_time': new_date}
                self.send_kafka(item)


            # else:
            #     if cate_type in ['buy']:
            #         df = pd.DataFrame(data=data_list, columns=['search_term', 'asin', 'page', 'buy_data', 'label'])
            #     else:
            #         df = pd.DataFrame(data=data_list, columns=self.columns)
            #     df['date_info'] = self.date_info
            #     if cate_type in ['zr', 'sp']:
            #         df = df.loc[:, ['search_term', 'asin', 'page', 'page_row', 'date_info']]
            #         df.drop_duplicates(['search_term', 'asin', 'page', 'page_row'], inplace=True)
            #     else:
            #         if cate_type in ['sb', 'tr']:
            #             df = df.loc[:, ['search_term', 'asin', 'page', 'cate_type', 'date_info']]
            #             df.rename(columns={"cate_type": "data_type"}, inplace=True)
            #             df.drop_duplicates(['search_term', 'asin', 'page', 'data_type'], inplace=True)
            #         elif cate_type in ['buy']:
            #             df = df.loc[:, ['search_term', 'asin', 'page', 'buy_data', 'date_info', 'label']]
            #             df.drop_duplicates(['search_term', 'asin', 'page', 'buy_data', 'label'], inplace=True)
            #         else:
            #             df = df.loc[:, ['search_term', 'asin', 'page', 'date_info']]
            #             df.drop_duplicates(['search_term', 'asin', 'page'], inplace=True)
            #     while True:
            #         try:
            #             self.engine_pg = self.pg_connect()
            #             if df.shape[0] > 0:
            #                 print("db_name:", db_name)
            #                 #             # two_dimensional_list = df.values.tolist()
            #                 #             # if cate_type == 'zr':
            #                 #             #     # 转换为 NumPy 数组
            #                 #             #     np_array = np.array(two_dimensional_list)
            #                 #             #     # 获取数组的行数
            #                 #             #     num_rows = np_array.shape[0]
            #                 #             #     # 计算分割点
            #                 #             #     split_point = num_rows // 2
            #                 #             #     # 分割数组
            #                 #             #     first_half = np_array[:split_point, :]
            #                 #             #     second_half = np_array[split_point:, :]
            #                 #             #     first_half_list = first_half.tolist()
            #                 #             #     second_half_list = second_half.tolist()
            #                 #             #     print('zr未分割总数：',len(two_dimensional_list))
            #                 #             #     print('zr分割后：',len(first_half_list), len(second_half_list))
            #                 #             #     self.send_kafka(first_half_list)
            #                 #             #     self.send_kafka(second_half_list)
            #                 #             # else:
            #                 #             #     self.send_kafka(two_dimensional_list)
            #                 # excel_filename = f'{cate_type}_{new_date}_example.xlsx'
            #                 # df.to_excel(excel_filename, index=False)
            #                 df.to_sql(db_name, con=self.engine_pg, if_exists="append", index=False)
            #             break
            #         except Exception as e:
            #             print(e, f"\n{traceback.format_exc()}", '存储关键词数据')
            #             time.sleep(5)
            #             continue

    def db_update_brand(self):
        """Append the collected (search_term, quantity_being_sold) rows to the weekly table.

        Does nothing when ``sort_all_list`` is empty. Retries every 5 seconds
        until the write succeeds.
        """
        if not self.sort_all_list:
            return

        stored = False
        while not stored:
            try:
                self.engine_pg = self.pg_connect()
                print(len(self.sort_all_list))
                df_being_sold = pd.DataFrame(data=self.sort_all_list,
                                             columns=['search_term', 'quantity_being_sold'])
                # Table suffix, e.g. date_info "2024-19" becomes "2024_19".
                year_week = self.date_info.replace('-', '_')
                df_being_sold['week'] = self.week
                df_being_sold['date_info'] = self.date_info
                year_moth_list = self.date_info.split('-')
                print('year_moth_list::,', year_moth_list)
                print(f'存储表：：{self.db_brand_analytics}_{year_moth_list[0]}')
                # De-duplicate before writing.
                df_being_sold = df_being_sold.drop_duplicates(['search_term', 'quantity_being_sold'])
                if df_being_sold.shape[0] > 0:
                    target_table = f'{self.db_brand_analytics}_{year_week}'
                    df_being_sold.to_sql(target_table, con=self.engine_pg,
                                         if_exists='append', index=False)
                stored = True
            except Exception as e:
                print('db_update_brand::', e, f"\n{traceback.format_exc()}")
                time.sleep(5)

    def send_kafka(self, items):
        """Send one payload to this site's rank-type topic, with up to 5 attempts.

        After a failed attempt the producer is replaced with a fresh connection
        and, unless it was the last attempt, the method waits 20 seconds before
        trying again. When every attempt fails the payload is dropped and that
        fact is logged instead of passing silently.

        Args:
            items: Payload handed to the producer's ``send``.

        Returns:
            True when an attempt completed without raising, False when all
            attempts failed.
        """
        print('传输 kafka 数据：：')
        topic = f"{self.site_name}_search_term_rank_type"
        max_attempts = 5
        for attempt in range(1, max_attempts + 1):
            try:
                future = self.kafuka_producer.send(topic, items)
                self.kafuka_producer.flush()
                future.get(timeout=15)  # wait up to 15 seconds for the send result
                return True
            except Exception as e:
                # Log first so the original error is not lost if reconnecting raises.
                print("调用卡夫卡接口报错", e, f"\n{traceback.format_exc()}")
                self.kafuka_producer = self.kafuka_connect()
                # No point in waiting after the final attempt.
                if attempt < max_attempts:
                    time.sleep(20)
        print(f"kafka send to {topic} abandoned after {max_attempts} attempts")
        return False

    def db_save_data(self):
        print('=========================  准备存储   ============================')
        # self.db_save_cookies()  # 选好邮编就开始存了
        if self.zr_all_list or self.sp_all_list:
            year_week = self.date_info.replace('-', '_')
            self.db_save_common(cate_type='zr', data_list=self.zr_all_list,
                                db_name=f"{self.db_search_term_zr}_{year_week}")
            self.db_save_common(cate_type='sp', data_list=self.sp_all_list,
                                db_name=f"{self.db_search_term_sp}_{year_week}")
            self.db_save_common(cate_type='sb', data_list=self.sb_all_list,
                                db_name=f"{self.db_search_term_sb}_{year_week}")
            self.db_save_common(cate_type='ac', data_list=self.ac_all_list,
                                db_name=f"{self.db_search_term_ac}_{year_week}")
            self.db_save_common(cate_type='bs', data_list=self.bs_all_list,
                                db_name=f"{self.db_search_term_bs}_{year_week}")
            self.db_save_common(cate_type='er', data_list=self.er_all_list,
                                db_name=f"{self.db_search_term_er}_{year_week}")
            self.db_save_common(cate_type='tr', data_list=self.tr_all_list,
                                db_name=f"{self.db_search_term_tr}_{year_week}")
            self.db_save_common(cate_type='buy', data_list=self.buy_text_list,
                                db_name=f"{self.db_other_search_term}_{year_week}")
            self.db_save_common(cate_type='hr', data_list=self.hr_list,
                                db_name=f"{self.db_search_term_hr}_{year_week}")
            self.db_update_brand()

#
# if __name__ == '__main__':
#     search_temp_mysql(site_name='de', week=19).run_pol()
