import datetime
import os
import queue
import sys
import threading
import time
import traceback
import socket
import uuid

import numpy as np
import pandas as pd
import redis
import logging

# Set before the remaining imports execute; presumably read by a pyarrow-based
# dependency — TODO confirm this flag is still needed by this script.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
# Make the parent of sys.path[0] importable so modules one level up can be found.
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from sqlalchemy import create_engine
import json
import ast
import gzip
from sqlalchemy import text
from parse_html_parse import HtmlDetailsParser
# Module-level parser instance; handle_data() hands its parse_data_new method
# to the worker processes.
htmldetailsparser = HtmlDetailsParser()
import concurrent.futures


class AsinParseHtmlToPg():
    def __init__(self, site_name='us', date_info='2024-06', thread_num=5, limit=200):
        """Store the run settings and create every database / Redis handle.

        Args:
            site_name: marketplace code; selects the PostgreSQL database name
                in ``db_params_dict`` and prefixes the Redis lock name.
            date_info: period such as "2024-06"; "-" is replaced by "_" to
                build the HTML source table name.
            thread_num: number of worker threads started by run_thread().
            limit: kept on the instance as ``self.limit``.
        """
        # NOTE(review): hosts, users and passwords are hard-coded in this
        # method; consider moving them to configuration outside the source.
        self.site_name = site_name
        self.thread_num = thread_num
        self.limit = limit
        self.html_table_name = "asin_html_" + date_info.replace('-', '_')

        # Bookkeeping used by the worker threads.
        self.local_name = f"{self.site_name}_asin_image_features"
        self.timeout = 120
        self.id_list_queue = queue.Queue()
        self.columns = [
            "id", "asin", "img_url", "title", "title_len", "price", "rating",
            "total_comments", "buy_box_seller_type", "page_inventory",
            "category", "volume", "weight", "rank", "launch_time",
            "category_state", "img_num", "img_type", "activity_type",
            "one_two_val", "three_four_val", "five_six_val", "eight_val",
            "qa_num", "one_star", "two_star", "three_star", "four_star",
            "low_star", "together_asin", "brand", "ac_name", "material",
            "node_id", "data_type", "sp_num", "describe", "date_info",
            "five_star", "weight_str", "package_quantity", "pattern_name",
            "spider_int", "follow_sellers", "product_description",
            "buy_sales", "image_view", "product_json", "product_detail_json",
            "review_ai_text", "review_label_json", "lob_asin_json",
            "sp_initial_seen_asins_json", "sp_4stars_initial_seen_asins_json",
            "sp_delivery_initial_seen_asins_json",
            "compare_similar_asin_json", "customer_reviews_json",
            "together_asin_json", "min_match_asin_json", "seller_json",
            "variat_num", "current_asin", "img_list", "variat_list",
            "parent_asin",
        ]

        # Engine for the HTML source database (internal network address).
        srs_user, srs_pwd = "fangxingjun", "fangxingjun12345"
        srs_host, srs_port, srs_db = "192.168.10.151", 19030, "us_spider"
        self.mysql_engine = create_engine(
            f"mysql+pymysql://{srs_user}:{srs_pwd}@{srs_host}:{srs_port}/{srs_db}")

        # Engine for the PostgreSQL target database (internal network address).
        self.pg_port = 5433
        self.pg_db = "selection"
        self.pg_user = "postgres"
        self.pg_pwd = "fazAqRRVV9vDmwDNRNb593ht5TxYVrfTyHJSJ3BS"
        pg_host = "192.168.10.223"
        self.db_engine192 = create_engine(
            f"postgresql://{self.pg_user}:{self.pg_pwd}@{pg_host}:{self.pg_port}/{self.pg_db}")

        # Settings consumed by create_db_connection(); the PostgreSQL database
        # name carries a site suffix for every site other than "us".
        if self.site_name == 'us':
            pg_db_name = "selection"
        else:
            pg_db_name = f"selection_{self.site_name}"
        self.db_params_dict = {
            "srs": dict(
                db_name="us_spider",
                host="192.168.10.151",
                port="19030",
                user="fangxingjun",
                password="fangxingjun12345",
                db_conn="mysql+pymysql",
            ),
            "pg_14": dict(
                db_name=pg_db_name,
                host="192.168.10.223",
                port="5433",
                user="postgres",
                password="fazAqRRVV9vDmwDNRNb593ht5TxYVrfTyHJSJ3BS",
                db_conn="postgresql",
            ),
        }
        self.engine_srs = self.create_db_connection(db_type="srs")
        self.engine_pg_14 = self.create_db_connection(db_type="pg_14")
        self.client_redis = redis.Redis(host='120.79.147.190', port=6379, db=9, password='fG7#vT6kQ1pX')


    def create_db_connection(self, db_type="pg_14"):
        """Build a SQLAlchemy engine from one entry of ``self.db_params_dict``.

        Engine construction is retried every 10 seconds until it succeeds.

        Args:
            db_type: key into ``self.db_params_dict`` (this class defines
                "srs" and "pg_14").

        Returns:
            The engine returned by ``create_engine``.

        Raises:
            KeyError: if ``db_type`` or one of its settings is missing.
        """
        # Resolve the settings outside the retry loop: a missing key can never
        # succeed on retry, so it must surface instead of looping forever.
        params = self.db_params_dict[db_type]
        url = (
            f"{params['db_conn']}://{params['user']}:{params['password']}"
            f"@{params['host']}:{params['port']}/{params['db_name']}"
        )
        while True:
            try:
                return create_engine(url)
            except Exception as e:
                print(f"{db_type}--建立数据库连接失败, 报错信息: {e}", traceback.format_exc())
                time.sleep(10)

    def reconnect_db_connection(self):
        """Rebuild the SRS / PostgreSQL engines and the Redis client.

        Called from the error handlers of the worker methods. The engines
        being replaced are disposed first so their pooled connections are
        closed instead of accumulating with every reconnect.
        """
        for attr_name in ("engine_srs", "engine_pg_14"):
            old_engine = getattr(self, attr_name, None)
            if old_engine is None:
                continue
            try:
                # dispose() only closes checked-in connections, so a thread
                # that is mid-query on the old engine is not interrupted.
                old_engine.dispose()
            except Exception as e:
                print(f"{attr_name}--释放旧数据库连接失败: {e}")

        self.engine_srs = self.create_db_connection(db_type="srs")
        self.engine_pg_14 = self.create_db_connection(db_type="pg_14")
        self.client_redis = redis.Redis(host='120.79.147.190', port=6379, db=9, password='fG7#vT6kQ1pX')


    # 加锁
    def acquire_lock(self, lock_name, timeout=120):
        """Try to take a Redis lock by setting ``lock_name`` only if it is absent.

        Args:
            lock_name: Redis key used as the lock.
            timeout: expiry in seconds, passed to the SET call as ``ex``.

        Returns:
            tuple: ``(acquired, token)`` where ``acquired`` is whatever the
            Redis SET call returned and ``token`` is the freshly generated
            UUID string stored as the lock value (needed by release_lock).
        """
        token = str(uuid.uuid4())
        return self.client_redis.set(lock_name, token, nx=True, ex=timeout), token

    # 释放锁
    def release_lock(self, lock_name, lock_value):
        """Delete the Redis lock, but only if it still holds ``lock_value``.

        The comparison and the delete run inside one Lua script, so a lock
        whose value no longer matches is left untouched.

        Args:
            lock_name: Redis key used as the lock.
            lock_value: token returned by acquire_lock().

        Returns:
            The value returned by the script: the DEL result when the token
            matched, otherwise 0.
        """
        compare_and_delete = """
        if redis.call("get", KEYS[1]) == ARGV[1] then
            return redis.call("del", KEYS[1])
        else
            return 0
        end
        """
        return self.client_redis.eval(compare_and_delete, 1, lock_name, lock_value)

    # 解压 html
    @staticmethod
    def decompress_bytes(input_bytes):
        if isinstance(input_bytes, str):
            input_bytes = ast.literal_eval(input_bytes)
        return gzip.decompress(input_bytes).decode('utf-8')

    # 读取数据
    def read_data(self, thread_id):
        """Take one id batch from the local queue, load its rows and mark them state 3.

        When the local queue is empty the method takes the Redis lock named
        ``self.local_name``, lets get_id_list_queue() refill the queue and
        tries again. On any error the message is printed, a batch that was
        already dequeued is put back on the queue, the connections are
        rebuilt and the loop retries after 5 seconds.

        Args:
            thread_id: worker index, used only in the log output.

        Returns:
            tuple: ``(df_read, id_tuple_str)`` — the rows (id, asin, html) of
            the batch that are in state 2, and the batch ids rendered as a
            SQL ``(...)`` list.
        """
        while True:
            id_tuple = None
            try:
                # get_nowait() instead of empty()+get(): another thread may
                # drain the queue between the two calls, which would leave a
                # blocking get() waiting forever.
                try:
                    id_tuple = self.id_list_queue.get_nowait()
                except queue.Empty:
                    id_tuple = None

                if id_tuple is None:
                    lock_acquired, lock_value = self.acquire_lock(lock_name=self.local_name)
                    if not lock_acquired:
                        print(f"lock_acquired: {lock_acquired}, 当前有其它进程占用redis的锁, 等待5秒继续获取id_list_queue数据")
                        time.sleep(5)  # wait before asking for the lock again
                        continue
                    try:
                        self.get_id_list_queue()
                    finally:
                        # Always give the lock back, even if the refill raised.
                        self.release_lock(lock_name=self.local_name, lock_value=lock_value)
                    continue

                start_time = datetime.datetime.now()
                # Render the ids explicitly; str(tuple) would embed NumPy
                # scalar reprs (e.g. "np.int64(1)") under NumPy 2.
                id_tuple_str = "(" + ", ".join(str(int(i)) for i in id_tuple) + ")"
                with self.engine_srs.begin() as connection:
                    # Filter on the configured site, matching the query that
                    # claimed these ids in get_id_list_queue().
                    sql_read = f"""SELECT id,asin, html FROM {self.html_table_name} WHERE site_name = '{self.site_name}' and state = 2 and id in {id_tuple_str};"""
                    df_read = pd.read_sql(sql_read, con=connection)
                    print(f"sql_read: {sql_read[:200]}", df_read.shape)
                    sql_update = f"UPDATE {self.html_table_name} SET state = 3 WHERE id in {id_tuple_str};"
                    print(f"sql_update: {sql_update[:200]}")
                    # text() is required by SQLAlchemy 2.x and accepted by 1.x.
                    connection.execute(text(sql_update))
                end_time = datetime.datetime.now()
                print(f"线程-{thread_id}, 读取数据量-{len(id_tuple)}, 耗时: {end_time - start_time}")
                return df_read, id_tuple_str
            except Exception as e:
                print(f"线程-{thread_id}, 读取数据错误: {e}", traceback.format_exc())
                if id_tuple is not None:
                    # Do not lose the batch: its rows would stay in state 2.
                    self.id_list_queue.put(id_tuple)
                time.sleep(5)
                self.reconnect_db_connection()
                continue


    def save_data(self, df, id_tuple_str, thread_id):
        """Append the parsed rows to PostgreSQL, then mark the source rows as state 4.

        This is the only definition of save_data: an earlier two-argument
        definition with the same name was shadowed by this one and never ran,
        so it has been removed.

        Errors are printed, the connections are rebuilt and the failed step
        is retried every 5 seconds until it succeeds. The insert is performed
        at most once, so a failure in the state update cannot append the
        same rows again.

        Args:
            df: parsed rows; must contain an ``id`` column.
            id_tuple_str: ids of the batch rendered as a SQL ``(...)`` list.
            thread_id: worker index, used only in the log output.
        """
        inserted = False
        while True:
            try:
                if not inserted:
                    df.to_sql("us_asin_detail_2024_wj", con=self.engine_pg_14, if_exists='append', index=False)
                    inserted = True
                    print('保存成功')
                # As before, the state is only advanced when rows were saved.
                if len(df) > 0:
                    with self.engine_srs.begin() as conn:
                        sql_update = f"UPDATE {self.html_table_name} SET state = 4 WHERE id IN {id_tuple_str}"
                        print(f"sql_update: {sql_update[:200]}")
                        # text() is required by SQLAlchemy 2.x and accepted by 1.x.
                        conn.execute(text(sql_update))
                break
            except Exception as e:
                print(f"线程-{thread_id}, 存储数据错误, 等待5s继续, 并重新建立数据库连接: {e}", traceback.format_exc())
                time.sleep(5)
                self.reconnect_db_connection()
                continue


    @staticmethod
    def process_item(asin_org, html_org, id_org, decompress_bytes, parse_data_new):
        html_str = json.loads(html_org)
        html = decompress_bytes(html_str)
        item = parse_data_new(asin_org, html, id_org)
        return item

    def handle_data(self, df, id_tuple_str, thread_id):
        """Parse every HTML row of a batch in a process pool.

        Args:
            df: rows to parse, with columns ``asin``, ``html`` and ``id``.
            id_tuple_str: ids of the batch as a SQL ``(...)`` list; kept for
                interface compatibility, not evaluated.
            thread_id: worker index, used only in the log output.

        Returns:
            pandas.DataFrame: one row per successfully parsed item, with the
            columns listed in ``self.columns``.
        """
        while True:
            try:
                start_time = datetime.datetime.now()
                data_lists = []
                with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
                    # Submit one parsing task per row to the process pool.
                    futures = [
                        executor.submit(self.process_item, asin_org, html_org, id_org,
                                        self.decompress_bytes, htmldetailsparser.parse_data_new)
                        for asin_org, html_org, id_org in zip(df['asin'], df['html'], df['id'])
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        try:
                            data_lists.append(future.result())
                        except Exception as e:
                            # NOTE(review): a failed item is only logged and is
                            # missing from the result — confirm this is intended.
                            print(f"An error occurred: {e}")
                end_time = datetime.datetime.now()
                df_save = pd.DataFrame(data=data_lists, columns=self.columns)
                # Count rows directly: eval() on the id string returned an int
                # for a single-id batch ("(1)"), which made len() raise and
                # restarted the whole parse in an endless loop.
                print(f"线程-{thread_id}, 处理数据量-{len(df)}, 耗时: {end_time - start_time}")
                return df_save
            except Exception as e:
                print(f"线程-{thread_id}, 处理数据报错, {df['asin']}出错", e, traceback.format_exc())
                time.sleep(3)
                continue

    def get_id_list_queue(self, limit=10000, batch_size=200):
        """Claim pending ids (state 1 -> 2) and put them on the local queue in batches.

        Runs under the Redis lock ``"<local_name>_main"``. The method keeps
        trying every 5 seconds until it has queued at least one batch: while
        the lock is taken, while no row is in state 1, or after an error.

        Args:
            limit: maximum number of ids claimed per call.
            batch_size: number of ids per queue entry.
        """
        lock_name = f"{self.local_name}_main"
        while True:
            holds_lock = False
            lock_value = None
            try:
                lock_acquired, lock_value = self.acquire_lock(lock_name=lock_name)
                if not lock_acquired:
                    print(f"lock_acquired: {lock_acquired}, 当前有其它进程占用redis的锁, 等待5秒继续获取id_list_queue数据")
                else:
                    holds_lock = True
                    sql = f"select id from {self.html_table_name} WHERE site_name = '{self.site_name}' and state=1 LIMIT {limit};"
                    print(f"sql--get_id_list_queue: {sql}")
                    df_id = pd.read_sql(sql, con=self.engine_srs)

                    # Plain Python ints: NumPy scalars would render as
                    # "np.int64(1)" under NumPy 2 and break the SQL below.
                    id_list = [int(i) for i in df_id["id"]]
                    if id_list:
                        id_tuple_str = "(" + ", ".join(str(i) for i in id_list) + ")"
                        with self.engine_srs.begin() as connection:
                            sql_update = f"UPDATE {self.html_table_name} SET state = 2 WHERE id in {id_tuple_str};"
                            print(f"sql_update: {sql_update[:200]}")
                            # text() is required by SQLAlchemy 2.x and accepted by 1.x.
                            connection.execute(text(sql_update))
                        for start in range(0, len(id_list), batch_size):
                            self.id_list_queue.put(tuple(id_list[start:start + batch_size]))
                        return
                    # Nothing to claim: skip the UPDATE (an empty "id in ()"
                    # list is invalid SQL) and poll again after the pause.
                    print(f"{self.html_table_name} 暂无 state=1 的数据, 等待5秒后重试")
            except Exception as e:
                print(f"读取数据--获取id队列数据错误: {e}", traceback.format_exc())
                try:
                    # Rebuilds client_redis (and the engines) that the other
                    # methods actually use.
                    self.reconnect_db_connection()
                except Exception as reconnect_error:
                    print(f"error: {reconnect_error}", traceback.format_exc())
            finally:
                if holds_lock:
                    try:
                        self.release_lock(lock_name=lock_name, lock_value=lock_value)
                    except Exception as release_error:
                        # The lock still expires through its timeout.
                        print(f"error: {release_error}", traceback.format_exc())
            time.sleep(5)  # pause before the next attempt



    def update_state_to_5(self, df, id_tuple_str):
        """Mark the source rows of a failed batch as state 5.

        Errors are printed and followed by a reconnect; they are not raised.

        Args:
            df: rows of the batch; nothing is updated when it has no rows.
            id_tuple_str: ids of the batch rendered as a SQL ``(...)`` list.
        """
        try:
            if len(df) == 0:
                return
            # Use the engine that reconnect_db_connection() refreshes and the
            # table derived from date_info rather than a fixed month.
            with self.engine_srs.begin() as conn:
                sql_update = f"UPDATE {self.html_table_name} SET state = 5 WHERE id IN {id_tuple_str}"
                # text() is required by SQLAlchemy 2.x and accepted by 1.x.
                conn.execute(text(sql_update))
            # Report success only after the transaction has been committed.
            print("将失败的asin成功更新为5")
        except Exception as e:
            print(f' update_state_to_5出错:{e}')
            self.reconnect_db_connection()



    def run(self, thread_id):
        while True:
            try:
                start_time = datetime.datetime.now()
                df_read, id_tuple_str = self.read_data(thread_id=thread_id)
                if df_read.shape[0]:
                    df_save = self.handle_data(df=df_read, thread_id=thread_id, id_tuple_str=id_tuple_str)
                    self.save_data(df=df_save, id_tuple_str=id_tuple_str, thread_id=thread_id)
                    end_time = datetime.datetime.now()
                    print(f"线程-{thread_id}的完整流程, 完成数量-{self.limit}, 总耗时: {end_time-start_time}")
                    # break
                else:
                    break
            except Exception as e:

                print(2222222222, e, traceback.format_exc())
                try:
                    connection_pool = redis.ConnectionPool(host='120.79.147.190', port=6379, db=9, password='fG7#vT6kQ1pX')
                    self.client = redis.Redis(connection_pool=connection_pool)
                except Exception as e:
                    print(f"error: {e}", traceback.format_exc())

                print(f"单个线程{thread_id}的完整流程报错, 等待10s继续, 并初始化数据库连接:", e, traceback.format_exc())
                time.sleep(10)
                self.reconnect_db_connection()

                continue


    def run_thread(self):
        """Start ``self.thread_num`` threads running run(thread_id) and wait for all of them."""
        workers = [
            threading.Thread(target=self.run, args=(worker_id, ))
            for worker_id in range(self.thread_num)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        logging.info("所有线程处理完成")


if __name__ == '__main__':
    # Script entry point: process the "us" site with 5 worker threads.
    run_settings = {"site_name": 'us', "thread_num": 5, "limit": 200}
    handle_obj = AsinParseHtmlToPg(**run_settings)
    handle_obj.run_thread()



