import sys
import os

# Add the parent of sys.path[0] to the module search path (presumably so the
# project packages imported below can be found — confirm against the repo layout).
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.db_connect import BaseUtils
from amazon_params import py_ja3
from utils.requests_param import Requests_param_val

from amazon_params.params import DB_SEARCH_TERM_PARAMS_SPIDER
from lxml import etree
from queue import Queue
import time
import re
import random
from curl_cffi import requests, Curl
import pandas as pd
from threading import Lock
import threading
import urllib3

import gzip
import requests as requests2

# Module-level `requests` session shared by the page-download code in search_temp_pg.
sess = requests2.Session()
# Silence urllib3's InsecureRequestWarning; requests in this file are sent with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class search_temp_pg(BaseUtils):
    def __init__(self, site_name='us', read_size=300, proxy_name=None, week=None, month=None):
        """Build the crawler state for one site.

        Args:
            site_name: Site code; stored, passed to ``Requests_param_val`` and
                embedded in the monthly detail topic name.
            read_size: Stored as ``self.read_size``.
            proxy_name: Passed through to ``Requests_param_val``.
            week: Accepted for call compatibility; not used in this constructor.
            month: Stored and embedded in both topic names.
        """
        super().__init__()
        self.site_name, self.month = site_name, month
        print(site_name, self.month, 'proxy_name-proxy_name:', proxy_name)
        self.reuests_para_val = Requests_param_val(
            site_name=self.site_name,
            spider="search_term",
            proxy_name=proxy_name,
        )
        self.read_size = read_size

        # Queues: cookies, items waiting to be crawled, priority items, PPPoE IPs.
        for queue_attr in ('cookies_queue', 'search_term_queue',
                           'search_term_priority_queue', 'pppoe_ip_queue'):
            setattr(self, queue_attr, Queue())
        # Lists: priority items, search terms, terms that hit the "dog" (not
        # found) page, and terms without related results.
        for list_attr in ('search_term_priority_list', 'search_term_list',
                          'search_term_not_found', 'asin_not_sure_list'):
            setattr(self, list_attr, [])

        # Home-page URL and host of the selected site.
        self.site_url, self.host = self.reuests_para_val.get_site_url(self.site_name)

        # Search-term bookkeeping: URLs without a next page, term+URL pairs,
        # successfully stored terms, and the week list used by the DB setup.
        for list_attr in ('search_term_not_next_page_list', 'search_term_url_list',
                          'search_term_list_update', 'week_list'):
            setattr(self, list_attr, [])
        self.stop_item_queue = True  # condition used to decide whether to leave the loop

        # Database handles and per-site table names.
        self.db_name_change_common()

        self.df_read = pd.DataFrame()
        self.id_tuple = ()
        # Counters: terms stored per loop, pages without sp results, header switches.
        self.nums_success = 0
        self.nums_no_sp = 0
        self.headers_num_int = 0
        self.headers_num_int_s = 0

        # Page-parsing result lists (only zr and sp carry page and page_row).
        for parsed_attr in ('zr_all_list', 'sp_all_list', 'sb_all_list', 'ac_all_list',
                            'bs_all_list', 'er_all_list', 'tr_all_list', 'buy_text_list',
                            'hr_list', 'sort_all_list'):
            setattr(self, parsed_attr, [])
        self.columns = ['search_term', 'asin', 'page', 'page_row', 'data_type',
                        'title', 'img_url', 'price', 'rating', 'total_comments']
        for list_attr in ('df_asin_detail_simply_list', 'not_sp_url_kw_list', 'st_list',
                          'delete_cookies_list',  # cookies that showed a Chinese postcode
                          'keyword_html_data_list'):
            setattr(self, list_attr, [])

        self.search_term_html_queue = Queue()
        # Queue used to hand data over to the database insert function.
        self.insert_queue = Queue()

        # Topic for stored HTML.
        self.search_term_html_topic = f'search_term_html_2024_{self.month}'
        # Topic for stored detail data.
        self.search_term_month_topic = f'{self.site_name}_search_term_month_2024_{self.month}'
        print('存储html 主题:', self.search_term_html_topic)
        print('存储数据详情 主题:', self.search_term_month_topic)
        self.data_list_asin = []

    def db_name_change_common(self):
        """Open the PostgreSQL and Kafka handles and derive the per-site table names.

        Every table name is ``self.site_name`` followed by the configured name
        from ``DB_SEARCH_TERM_PARAMS_SPIDER`` with its first two characters
        dropped, followed by a suffix (``_month`` unless stated otherwise).
        """
        self.engine_pg = self.pg_connect()
        self.kafuka_producer = self.kafuka_connect(acks=True)

        def site_table(param_key, suffix='_month'):
            # site code + configured name without its 2-char prefix + suffix
            return self.site_name + DB_SEARCH_TERM_PARAMS_SPIDER[param_key][2:] + suffix

        self.db_search_term = site_table("db_search_term", '_month_syn')
        # For these tables the attribute name equals the configuration key.
        for param_key in ("db_search_term_zr", "db_search_term_sp", "db_search_term_sb",
                          "db_search_term_ac", "db_search_term_bs", "db_search_term_er",
                          "db_search_term_tr", "db_search_term_hr", "db_other_search_term"):
            setattr(self, param_key, site_table(param_key))
        self.db_brand_analytics = site_table("us_brand_analytics")

    def get_search_kw(self, t_num, page=1):
        while True:
            if self.search_term_queue.empty() == False:
                keywords_scraper_url = self.search_term_queue.get()
                keywords_scraper_url_list = keywords_scraper_url.split('|||')
                keywords_id = keywords_scraper_url_list[0]

                scraper_url = keywords_scraper_url_list[1]
                print(keywords_id)
                print(scraper_url)
                cookies_dict = self.reuests_para_val.get_cookie()
                self.cookie_dict_delete_id = cookies_dict
                for ck in cookies_dict.values():
                    self.cookies_queue.put(ck)
                # 获取组装cookie
                cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue)
                headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url,
                                                                        asin=None, scraper_url=scraper_url)
                headers["cookie"] = cookie_str
                for i in range(5):
                    try:
                        if self.headers_num_int > 120:
                            resp = requests.get(scraper_url, impersonate="chrome110", headers=headers,
                                                timeout=10, verify=False)
                        else:
                            sess.mount(self.site_url, py_ja3.DESAdapter())
                            resp = sess.get(scraper_url, headers=headers,
                                            timeout=10, verify=False)
                        if self.reuests_para_val.check_amazon_yzm(resp):
                            self.search_term_priority_list.append(keywords_id)
                            self.headers_num_int += 1
                            continue
                        break
                    except Exception as e:
                        print('请求报错：', e)
                        time.sleep(random.uniform(5, 17.5))
                        continue
                response = resp.text
                etree_html = etree.HTML(response)
                try:
                    ingress = etree_html.xpath("//span[@id='glow-ingress-line2']/text()")
                    print("**************  邮编  ***************", ingress)
                    ingress = ingress[0].strip()
                except:
                    ingress = None
                    print("获取邮编錯誤:")
                # 获取月销

                # 获取asin

                asin_list = etree_html.xpath(
                    '//div[@class="a-section a-spacing-none faceout-product-title"]//a//@href|//a[contains(@aria-label,"Sponsored")]/@href|//div[@data-asin]/@data-asin')
                for href in asin_list:
                    if len(href) > 10 and 'https' in href:
                        print(href, 'hrefhrefhref')
                        asins = re.findall("https://www.*/dp/(.*?)\?", href)
                        if len(asins) == 0:
                            asins = re.findall("asins=(.*?)&", href)
                            if asins:
                                if len(asins[0]) > 10:
                                    asins[0] = asins[0][:10]
                            else:
                                # 优化sbv 正则只匹配英文 数字，判断
                                asins = re.findall("asins=(.*?)%2", href)
                                if asins:
                                    if len(asins[0]) > 10:
                                        asins[0] = asins[0][:10]
                        else:
                            if len(asins[0]) > 10:
                                asins[0] = asins[0][:10]
                        try:
                            href = asins[0]
                        except:
                            href = None
                    elif 'javascript' in href:
                        href = None
                    if href:
                        buy_num = etree_html.xpath(
                            f'//div[@data-dib-asin="{href}"]//span[contains(text(),"bought in past")]/text()|//div[@data-asin="{href}"]//span[contains(text(),"bought in past")]/text()')
                        print(href, buy_num, page)
                        if buy_num:
                            buy_num = buy_num[0]
                            t = re.findall(r'(.*?)\+', buy_num)
                            b_num = t
                        else:
                            buy_num = 0
                            b_num = None
                        asin_data_list = [keywords_id, page, href, b_num, buy_num]
                        self.data_list_asin.append(asin_data_list)
                self.db_save_data(self.data_list_asin, page)
                # 判断是否有下一页
                lock = Lock()
                lock.acquire()
                no_results = 'No results for'
                if no_results in response:
                    pass
                else:
                    Next_list = etree_html.xpath("//a[contains(text(),'Next')]/@href")
                    if Next_list:
                        Next_url = 'https://www.amazon.com' + Next_list[0]
                        print(Next_url, '下一页url')
                        search_url = keywords_id + '|||' + Next_url
                        self.search_term_queue.put(search_url)
                        page += 1
                    else:
                        break
                lock.release()
                # if page > 3:
                #     break
                # with open(rf'D:\新建文件夹\requests_files\13291371.html','w',encoding='utf-8')as f:
                #     f.write(response)
                # if page == 1:
            else:
                print(f"当前线程-{t_num} 已完成-爬取-跳出循环")
                break

    # def xpath_html(self):

    # Compress a string
    def compress_string(self, input_string):
        return gzip.compress(input_string.encode())

    # Decompress a string
    def decompress_string(self, input_bytes):
        # 在Python中，字节对象和它们的字符串表示形式是不同的。例如，字节对象 b'hello' 和字符串 'b'hello'' 是不同的，尽管它们看起来很相似。
        # 当你从数据库中读取压缩的数据时，如果数据被存储为二进制格式，那么你需要以二进制形式读取它，然后解压缩。但是，如果数据被存储为字符串（
        # 可能是因为数据库的限制或其他原因），那么你需要先将字符串转换为字节对象，然后再解压缩。
        import ast
        if isinstance(input_bytes, str):
            # html_str = json.loads(html_str)
            input_bytes = ast.literal_eval(input_bytes)
        return gzip.decompress(input_bytes).decode('utf-8')

    def init_list(self):
        print("=======清空变量==========")
        self.item_queue = Queue()  # 存储 item 详情数据队列
        self.search_term_queue = Queue()  # 需要爬取的asin队列
        self.buyBox_list = []  # 卖家名称 url 列表
        self.delete_cookies_list = []  # 存储出现中国邮编的cookie
        self.search_term_not_found = []  # 没有关键词相关结果的列表
        self.asin_not_sure_list = []  # 没有关键词相关结果的列表
        self.search_term_not_next_page_list = []  # 存放找不到下一页关键词的url队列
        self.search_term_url_list = []  # 存储要爬取的url
        self.zr_all_list = []
        self.sp_all_list = []
        self.sb_all_list = []
        self.ac_all_list = []
        self.bs_all_list = []
        self.er_all_list = []
        self.tr_all_list = []
        self.buy_text_list = []
        self.hr_list = []
        self.sort_all_list = []
        self.df_asin_detail_simply_list = []
        self.cookies_list = []
        self.search_term_list = []  # 存放search_term的列表
        self.headers_num_int = 0
        self.search_term_html_queue = Queue()
        # 创建一个队列用于将数据传递给数据库插入函数
        self.insert_queue = Queue()
        self.keyword_html_data_list = []
        self.kafuka_producer.close()

    def run_pol(self):
        """Seed the queue with the built-in start item, run the worker thread and clean up.

        After the worker has finished, the Kafka producer is flushed, the
        collected cookies are passed to ``delete_china_cookie`` and the
        per-run state is reset via ``init_list``.
        """
        search_term_list = [
            'Clothing, Shoes & Jewelry›Women›Clothing›Tops, Tees & Blouses›Tanks & Camis|||https://www.amazon.com/b/node=2368344011']
        if not search_term_list:
            return

        # Load cookies only when none are queued yet.
        if self.cookies_queue.empty():
            cookies_dict = self.reuests_para_val.get_cookie()
            self.cookie_dict_delete_id = cookies_dict
            for ck in cookies_dict.values():
                self.cookies_queue.put(ck)

        for search_url in search_term_list:
            self.search_term_queue.put(search_url)

        # A single worker thread drains the queue.
        workers = [threading.Thread(target=self.get_search_kw, args=(idx,)) for idx in range(1)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        print('最后刷新kafka flush')
        self.kafuka_producer.flush()
        print('当前线程抓取结束')

        # Delete the flagged cookies.
        print("删除cookie：", len(self.delete_cookies_list))
        self.reuests_para_val.delete_china_cookie(list(set(self.delete_cookies_list)))
        # Reset the per-run state.
        self.init_list()

    def db_save_data(self, data_list, page):
        for i in range(5):
            try:
                engine_mysql = self.mysql_connect()
                df_being_sold = pd.DataFrame(data=data_list, columns=['category', 'page', 'asin', 'buy', 'buy_str'])
                df_being_sold.drop_duplicates(['page', 'asin'], inplace=True)  # 去重
                df_being_sold.to_sql('category_asin_data_pyb_copy1', con=engine_mysql,
                                     if_exists='append',
                                     index=False)
                self.data_list_asin = []
                break
            except:
                print('存储报错')
                time.sleep(random.uniform(5, 17.5))


# Script entry point: build a crawler with default settings and run one crawl.
if __name__ == '__main__':
    crawler = search_temp_pg()
    crawler.run_pol()
