import sys
import os

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from utils.db_connect import BaseUtils
from amazon_save_db.save_asin_html_params_data import Save_asin_detail
from multiprocessing import Pool
import pandas as pd
import traceback
from queue import Queue
import threading
import json
import ast
import gzip
from utils.asin_parse import ParseAsinUs
from datetime import datetime
from func_timeout.exceptions import FunctionTimedOut
import socket

# Database connections, created once when the module is imported.
# StarRocks engine: used by the module-level helpers that read the asin_html tables.
engine_strrocks = BaseUtils().starrocks_connect()

# Second engine (presumably PostgreSQL, judging by the helper name): used for the
# <site_name>_asin_html_maxid_minid bookkeeping table.
engine_pg14 = BaseUtils().pg_connect()
import time


class Parse_asin_html():
    """Parse one batch of stored ASIN detail pages and persist the results.

    A single instance handles a single batch: ``run_pol`` fills
    ``asin_html_queue``, parses the pages on worker threads into
    ``item_queue``, hands that queue to ``Save_asin_detail.process_item`` and
    finally marks the parsed ASINs as ``state=3`` in the
    ``asin_html_<date_info>`` table.
    """

    def __init__(self, site_name=None, date_info=None):
        """Create the queues, accumulators and external connections.

        Args:
            site_name: Site code (the script entry point passes ``'us'``).
            date_info: String such as ``'2024-07'``. Although it defaults to
                ``None`` it is required in practice, because the part after
                the last ``-`` is used as the month and the whole value is
                used to build the table name.
        """
        self.site_name = site_name  # site code
        self.date_info = date_info
        self.read_size = 10
        self.asin_html_queue = Queue()  # packed rows waiting to be parsed
        self.spider_int = 0
        self.month = self.date_info.split('-')[-1]
        self.asin_state_queue = Queue()  # ASINs whose row must be set to state=3
        self.db_syn = f'asin_html_{date_info.replace("-", "_")}'
        self.item_queue = Queue()  # parsed items handed to the saver
        self.buyBox_list = []
        self.buyBoxname_asin_list = []
        self.bs_category_asin_list_pg = []
        self.all_img_video_list = []
        self.asin_variation_list = []
        self.save_asin_detail = Save_asin_detail(site_name=self.site_name, proxy_name=None, month=self.month,
                                                 spider_int=1)
        self.kafuka_producer = BaseUtils().kafuka_connect(bootstrap_servers=True)
        self.engine_strrocks = BaseUtils().starrocks_connect()

    def decompress_bytes(self, input_bytes):
        """Return the UTF-8 text stored in a gzip-compressed payload.

        Args:
            input_bytes: Either a ``bytes`` object or the textual ``repr`` of
                one (e.g. ``"b'\\x1f\\x8b...'"``); the latter is turned back
                into bytes with ``ast.literal_eval`` first.
        """
        if isinstance(input_bytes, str):
            input_bytes = ast.literal_eval(input_bytes)
        return gzip.decompress(input_bytes).decode('utf-8')

    def get_asin_html(self):
        """Thread target: parse queued rows until ``asin_html_queue`` is empty.

        The queue is drained with ``get_nowait`` so that a thread which loses
        the race for the last element exits instead of blocking forever in
        ``get()`` (which would make the ``join()`` in ``run_pol`` hang).

        A row that cannot be parsed is logged and skipped so that one bad page
        does not terminate the worker thread; its ASIN is not queued for the
        ``state=3`` update.
        """
        from queue import Empty  # local import: the module itself only imports Queue

        while True:
            try:
                asin_html_str = self.asin_html_queue.get_nowait()
            except Empty:
                print('当前线程完成')
                break
            try:
                self._parse_one(asin_html_str)
            except Exception as e:
                print('asin html parse failed:', e, f"\n{traceback.format_exc()}")

    def _parse_one(self, asin_html_str):
        """Parse one packed ``asin|html|created_time`` row and enqueue the result."""
        asin_html_str_list = asin_html_str.split('|-|-|-|-|-|')
        asin = asin_html_str_list[0]
        asin_b_html = asin_html_str_list[1]
        created_time = str(asin_html_str_list[2])
        html_str = json.loads(asin_b_html)
        asin_html = self.decompress_bytes(html_str)  # gunzip the stored page
        items = ParseAsinUs(resp=asin_html, asin=asin, month=self.month, date_info=self.date_info,
                            site_name=self.site_name).xpath_html()
        new_date = created_time
        # Key order is kept exactly as before in case the saver depends on it.
        item = {
            'asin': items["asin"],
            'week': items["week"],
            'month': items["month"],
            'title': items["title"],
            'img_url': items["img_url"],
            'rating': items["rating"],
            'total_comments': items["total_comments"],
            'price': items["price"],
            "rank": items["rank"],
            'category': items["category"],
            'launch_time': items["launch_time"],
            'volume': items["volume"],
            'weight': items["weight"],
            "page_inventory": items["page_inventory"],
            "buy_box_seller_type": items["buy_box_seller_type"],
            # Overwritten further down with items["asin_variation_list"].
            "asin_vartion_list": items["asin_vartion_list"],
            'title_len': items["title_len"],
            'img_num': items["img_num"],
            'img_type': items["img_type"],
            'activity_type': items["activity_type"],
            'one_two_val': items["one_two_val"],
            'three_four_val': items["three_four_val"],
            'eight_val': items["eight_val"],
            'qa_num': items["qa_num"],
            'five_star': items["five_star"],
            'four_star': items["four_star"],
            'three_star': items["three_star"],
            'two_star': items["two_star"],
            'one_star': items["one_star"],
            'low_star': items["low_star"],
            'together_asin': items["together_asin"],
            'brand': items["brand"],
            'ac_name': items["ac_name"],
            'material': items["material"],
            'node_id': items["node_id"],
            'data_type': 1,
            'sp_num': items["sp_num"],
            'describe': items["describe"],
            'date_info': self.date_info,
            'weight_str': items["weight_str"],
            'package_quantity': items['package_quantity'],
            'pattern_name': items['pattern_name'],
            'seller_id': items["seller_id"],
            'variat_num': items['variat_num'],
            'site_name': self.site_name,
            'best_sellers_rank': items["best_sellers_rank"],
            'best_sellers_herf': items["best_sellers_herf"],
            'account_url': items["account_url"],
            'account_name': items["account_name"],
            'parentAsin': items["parentAsin"],
            'asinUpdateTime': new_date,
            'follow_sellers': items['sellers_num'],
            'spider_int': self.spider_int,
            'all_best_sellers_herf': items['all_best_sellers_herf'],
            'product_description': items['product_description'],
            'buy_sales': items['buySales'],
            'image_view': items['image_view'],
            'product_json': items['product_json'],
            'product_detail_json': items['productdetail_json'],
            'review_ai_text': items['review_ai_text'],
            'review_label_json': items['review_label_json'],
            'lob_asin_json': items['lob_asin_json'],
            'sp_initial_seen_asins_json': items['sp_initial_seen_asins_json'],
            'sp_4stars_initial_seen_asins_json': items['sp_4stars_initial_seen_asins_json'],
            'sp_delivery_initial_seen_asins_json': items['sp_delivery_initial_seen_asins_json'],
            'compare_similar_asin_json': items['compare_similar_asin_json'],
            'customer_reviews_json': items['customer_reviews_json'],
            'together_asin_json': items['together_asin_json'],
            'min_match_asin_json': items['min_match_asin_json'],
            'seller_json': items['seller_json'],
            'created_time': new_date,
            'current_asin': items['current_asin'],
            'parent_asin': items["parentAsin"],
        }
        # 'five_six_val' is only read from the parser output for these sites.
        if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
            item['five_six_val'] = items['five_six_val']
        else:
            item['five_six_val'] = None

        if items["buyBox_list"]:
            self.buyBox_list.extend(items["buyBox_list"])
        if items["buyBoxname_asin_list"]:
            self.buyBoxname_asin_list.extend(items["buyBoxname_asin_list"])
        if items["bs_category_asin_list_pg"]:
            self.bs_category_asin_list_pg.extend(items["bs_category_asin_list_pg"])

        variations = items["asin_variation_list"]
        item['variat_list'] = json.dumps(variations) if variations else None  # variations
        item['asin_vartion_list'] = variations

        media = items["all_img_video_list"]
        # An item without media carries an empty list. This is the value the
        # consumer already observed, because the shared dict used to be
        # patched from None to [] right after it was queued.
        item['img_list'] = json.dumps(media) if media else []

        # Queue the item exactly once (it used to be queued twice, which
        # duplicated every row handed to the saver).
        self.item_queue.put(item)
        self.asin_state_queue.put(item['asin'])

    # NOTE: a former ``send_kafka`` step (topic
    # ``<site_name>_asin_detail_month_2024_<month>``, up to 3 attempts with a
    # 15 s pause after a failure) is disabled. The producer is still created in
    # ``__init__``, flushed in ``run_pol`` and closed in ``init_list``.

    def db_change_state_common(self):
        """Mark every ASIN in ``asin_state_queue`` as ``state=3``.

        ASINs are drained in groups of at most 1000 and each group is written
        with one ``UPDATE ... WHERE state=2`` statement on ``self.db_syn``.
        A failing update is retried every 15 seconds until it succeeds. The
        method returns once a drain yields no ASIN.
        """
        while True:
            asin_list = []
            # Take at most 1000 ASINs from the queue.
            while len(asin_list) < 1000 and not self.asin_state_queue.empty():
                asin_list.append(self.asin_state_queue.get())
            print('state=3asin_list:', len(asin_list))
            if not asin_list:
                # Nothing left in the queue: all state updates are done.
                break
            while True:
                try:
                    with self.engine_strrocks.begin() as conn:
                        # sql_column = 'set partial_update_mode="column";'
                        # conn.execute(sql_column)
                        if len(asin_list) == 1:
                            sql_update = f"UPDATE {self.db_syn} SET state=3 WHERE asin='{asin_list[0]}' AND state=2;"
                        else:
                            asins = "', '".join(asin_list)
                            sql_update = f"UPDATE {self.db_syn} SET state=3 WHERE asin IN ('{asins}') AND state=2;"
                        conn.execute(sql_update)
                    break
                except Exception as e:
                    print(f"更改{self.db_syn}表错误", e, f"\n{traceback.format_exc()}")
                    time.sleep(15)

    def init_list(self):
        """Reset all queues and accumulators and close the Kafka producer.

        Because the producer is closed here, the instance should not be used
        for sending afterwards.
        """
        self.asin_state_queue = Queue()
        self.asin_html_queue = Queue()
        self.item_queue = Queue()
        self.buyBox_list = []
        self.buyBoxname_asin_list = []
        self.bs_category_asin_list_pg = []
        self.all_img_video_list = []
        self.asin_variation_list = []
        self.kafuka_producer.close()

    def run_pol(self, asin_html_list):
        """Process one batch end to end.

        Args:
            asin_html_list: Packed ``asin|-|-|-|-|-|html|-|-|-|-|-|created_time``
                strings, as produced by ``db_read_data_common``.

        Steps: parse on 50 threads, pass ``item_queue`` to the saver (retried
        for as long as it raises ``FunctionTimedOut``), flush the producer,
        set the parsed rows to ``state=3`` and reset the instance.
        """
        for asin_html in asin_html_list:
            self.asin_html_queue.put(asin_html)
        html_thread = [threading.Thread(target=self.get_asin_html) for _ in range(50)]
        for ti in html_thread:
            ti.start()
        for t2 in html_thread:
            t2.join()
        print("存储数据 存储asin详情，信息，")
        while True:
            try:
                # Only the item queue and the month are supplied; the 13
                # positional arguments in between are passed as None.
                self.save_asin_detail.process_item(self.item_queue, None,
                                                   None, None,
                                                   None, None,
                                                   None, None,
                                                   None, None,
                                                   None, None,
                                                   None,
                                                   None, self.month)
                break
            except FunctionTimedOut as e:
                print('断网', e)
        print('开始刷新')
        self.kafuka_producer.flush()
        print('刷新完成，修改状态')
        self.db_change_state_common()
        print('初始化变量')
        self.init_list()


def db_read_data_common(start_id, limit, site_name, db_search_term):
    """Claim the unparsed rows of one id range and return them packed as strings.

    Inside one ``engine_strrocks.begin()`` block the function selects the rows
    with ``state=1`` for ``site_name`` whose id lies in
    ``[start_id, start_id + limit - 1]`` and sets those rows to ``state=2``.

    Args:
        start_id: First id of the range (inclusive).
        limit: Number of consecutive ids in the range.
        site_name: Site code used in the ``site_name`` filter.
        db_search_term: Name of the table to read from and update.

    Returns:
        A list of ``asin|-|-|-|-|-|html|-|-|-|-|-|created_time`` strings, or
        ``[]`` when the range holds no ``state=1`` row.

    Any exception is printed and the whole operation is retried after 15
    seconds, indefinitely.
    """
    while True:
        try:
            with engine_strrocks.begin() as conn:
                sql_read = f"SELECT id, asin, html,created_time FROM {db_search_term} WHERE state=1 and site_name='{site_name}' and id BETWEEN {start_id} AND {start_id + limit - 1}"
                print(sql_read)
                a = conn.execute(sql_read)
                df_read = pd.DataFrame(a, columns=['id', 'asin', 'html', 'created_time'])
                if df_read.shape[0] == 0:
                    return []
                # sql_column = 'set partial_update_mode="column";'
                # conn.execute(sql_column)
                # Render the ids explicitly as plain ints. Interpolating a
                # tuple of NumPy scalars relies on their repr, which is
                # "np.int64(1)" on NumPy >= 2 and would produce invalid SQL.
                id_csv = ', '.join(str(int(row_id)) for row_id in df_read.id)
                sql_update = f'UPDATE {db_search_term} set state=2 where id in ({id_csv});'
                conn.execute(sql_update)
                return list(
                    df_read.asin + '|-|-|-|-|-|' + df_read.html + '|-|-|-|-|-|' + df_read.created_time.astype("U"))
        except Exception as e:
            # The message now states the real pause (it used to say 5s).
            print("读取数据出bug并等待15s继续", e, f"\n{traceback.format_exc()}")
            time.sleep(15)


def worker(start_id, limit, site_name, date_info, table_name):
    """Pool task: claim one id range and, if it holds rows, parse them.

    The range is claimed through ``db_read_data_common``; when that returns a
    non-empty list a fresh ``Parse_asin_html`` is built for the batch and its
    ``run_pol`` is invoked. Nothing is returned.
    """
    claimed_rows = db_read_data_common(start_id, limit, site_name, table_name)
    if not claimed_rows:
        return
    batch_parser = Parse_asin_html(site_name=site_name, date_info=date_info)
    batch_parser.run_pol(claimed_rows)


def get_asin_html_count(site_name, date_info, table_name):
    """Return ``(max_id, min_id)`` of the ``state=1`` rows of a site.

    The values come straight from ``SELECT max(id), min(id)`` on
    ``table_name`` and are printed before being returned. ``date_info`` is
    accepted but not used by this function.
    """
    query = f"SELECT max(id), min(id) FROM {table_name} WHERE state=1 and site_name='{site_name}'"
    bounds = pd.read_sql(query, con=engine_strrocks)
    max_id, min_id = bounds.iloc[0, 0], bounds.iloc[0, 1]
    for bound in (max_id, min_id):
        print(bound)
    return max_id, min_id


def get_ip_address():
    """Return the local IP address of this host as a string.

    A datagram socket is connected to ``('baidu.com', 0)`` and the local
    address of that socket is read back, printed and returned. The socket is
    closed when the ``with`` block exits (it used to be left open). The host
    name must be resolvable for the call to succeed.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
        s.connect(('baidu.com', 0))
        ip = s.getsockname()[0]
    print('内网ip：', ip)
    return ip


def sava_maxid_minid(data_range_list, site_name):
    """Append id-range batches to ``<site_name>_asin_html_maxid_minid``.

    Args:
        data_range_list: Rows of two values each, stored in the columns
            ``maxid`` and ``batch_size`` in that order.
        site_name: Site code used as the table-name prefix.
    """
    target_table = f'{site_name}_asin_html_maxid_minid'
    ranges_df = pd.DataFrame(data=data_range_list, columns=['maxid', 'batch_size'])
    ranges_df.to_sql(target_table, con=engine_pg14, if_exists='append', index=False)


def get_sava_maxid_minid(site_name):
    """Claim the next pending id-range batch of a site.

    Within one ``engine_pg14.begin()`` block, a ``state=1`` row of
    ``<site_name>_asin_html_maxid_minid`` is selected with
    ``limit 1 for update`` and then set to ``state=2``.

    Returns:
        A list of ``"<maxid>|-|-|-|-|-|<batch_size>"`` strings for the claimed
        row(s), or ``[]`` when no ``state=1`` row is left.
    """
    with engine_pg14.begin() as conn:
        sql_read = f"select id, maxid, batch_size from {site_name}_asin_html_maxid_minid where state=1 limit 1 for update"
        print(sql_read)
        a = conn.execute(sql_read)
        df_read = pd.DataFrame(a, columns=['id', 'maxid', 'batch_size'])
        if df_read.shape[0] == 0:
            return []
        # Render the ids explicitly as plain ints instead of interpolating a
        # tuple of NumPy scalars, whose repr is "np.int64(1)" on NumPy >= 2.
        id_csv = ', '.join(str(int(row_id)) for row_id in df_read.id)
        sql_update = f'UPDATE {site_name}_asin_html_maxid_minid set state=2 where id in ({id_csv});'
        conn.execute(sql_update)
        return list(df_read.maxid.astype("U") + '|-|-|-|-|-|' + df_read.batch_size.astype("U"))


def split_task(start_id, limit, num_splits):
    """Cut an id range into ``num_splits`` consecutive ``(start_id, limit)`` parts.

    Every part except the last covers ``limit // num_splits`` ids; the last
    part additionally takes whatever the integer division left over.

    Args:
        start_id: First id of the range to split.
        limit: Total number of ids in the range.
        num_splits: Number of parts to produce.

    Returns:
        A list of ``(part_start_id, part_limit)`` tuples in ascending order.
    """
    chunk = limit // num_splits
    final_index = num_splits - 1
    return [
        (start_id + index * chunk, limit - index * chunk if index == final_index else chunk)
        for index in range(num_splits)
    ]


if __name__ == '__main__':
    # Driver: repeatedly claim one id-range batch from the bookkeeping table,
    # split it into num_processes sub-ranges and parse them in a process pool.
    site_name = 'us'  # site code
    date_info = '2024-07'  # passed to the workers and used to derive table_name
    batch_size = 3000  # ids per stored batch; only used by the disabled seeding step below
    num_processes = 15  # pool size, and number of sub-ranges each batch is split into
    table_name = f"asin_html_{date_info.replace('-', '_')}"
    # One-off seeding step (disabled): compute the id span of the unparsed rows
    # and store it as (first id, id count) batches of batch_size ids each.
    # # if get_ip_address() == '192.168.200.210':
    # max_id, min_id = get_asin_html_count(site_name, date_info, table_name)
    # total_data = max_id - min_id + 1  # inclusive id span
    # data_range = []
    # start_id = min_id
    # while start_id <= max_id:
    #     end_id = min(start_id + batch_size - 1, max_id)  # clamp to max_id
    #     data_range.append((start_id, end_id - start_id + 1))  # (first id, id count)
    #     start_id = end_id + 1  # next batch starts right after this one
    # sava_maxid_minid(data_range, site_name)

    def _report_worker_error(exc):
        """Print an exception raised inside a pool worker.

        The AsyncResult objects are never inspected, so without this callback
        a failing worker would go completely unnoticed.
        """
        print('worker failed:', repr(exc))
        traceback.print_exception(type(exc), exc, exc.__traceback__)

    while True:
        maxid_minid_list = get_sava_maxid_minid(site_name)
        if not maxid_minid_list:
            print("所有批次处理完成")
            break
        p = Pool(num_processes)
        print('maxid_minid_list::', maxid_minid_list)
        for start_limit in maxid_minid_list:
            start_id_text, limit_text = start_limit.split('|-|-|-|-|-|')[:2]
            start_id = int(start_id_text)
            limit = int(limit_text)
            # Split the batch further into num_processes sub-tasks.
            sub_tasks = split_task(start_id, limit, num_processes)
            for sub_start_id, sub_limit in sub_tasks:
                p.apply_async(worker, args=(sub_start_id, sub_limit, site_name, date_info, table_name),
                              error_callback=_report_worker_error)
        p.close()
        p.join()
        print('结束当前进程')
        time.sleep(3)
        # Rebind the module-level engine read by get_sava_maxid_minid to a
        # freshly created one before the next iteration.
        engine_pg14 = BaseUtils().pg_connect()


