import requests
import json
import datetime
import threading
import queue
from queue import Queue
from multiprocessing import Pool
from all_connect import ConnectSpider
# Module-level ConnectSpider instance; every Con.* call in this file goes through it.
# NOTE(review): presumably wraps the storage/database connection - confirm in all_connect.
Con = ConnectSpider()
from lxml import etree


class HD_details:
    """Fetch Home Depot product detail pages and persist the parsed fields.

    One instance handles one batch of records (see ``run``). Worker threads
    fetch and parse each page, pushing results onto ``item_queue`` and the
    ids of records that could not be processed onto ``failed_queue``. After
    the threads finish, ``run`` saves the collected items through ``Con`` and
    reports the successful and failed ids through ``update``.
    """

    # Separator between the fields of one input record, and how many fields
    # (search_term, detail_id, title, price, url, image, sku) a record has.
    FIELD_SEPARATOR = '|-|-|-|-|-|'
    FIELD_COUNT = 7
    # Seconds to wait for the server before giving up on a request; without a
    # timeout a stalled connection would block its worker thread forever.
    REQUEST_TIMEOUT = 30
    # Maximum number of attempts for a request hitting "Connection aborted.".
    MAX_RETRIES = 3
    # Number of threads used by ``run`` to process one batch.
    WORKER_THREADS = 5

    def __init__(self):
        self.headers = {
            'Referer': 'https://www.homedepot.com/s/02%20sensor%20downstream?NCNI-5',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }
        self.cookies = {
            'AMCV_F6421253512D2C100A490D45%40AdobeOrg': 'MCMID|16129746417315936432497630021679731371',
        }
        # (item dict, detail_id) tuples produced by save_data.
        self.item_queue = Queue()
        # Lists of ids whose items were saved successfully.
        self.success_queue = Queue()
        # Lists of ids that failed to fetch, parse or save.
        self.failed_queue = Queue()
        # Raw input records that could not be split into the expected fields.
        self.invalid_queue = Queue()

    def _mark_failed(self, detail_id):
        """Record ``detail_id`` as failed (as a one-element list)."""
        # The queue is unbounded, so a blocking put() cannot raise queue.Full.
        self.failed_queue.put([detail_id])
        print(f'将失败id:{detail_id} 放入失败队列')

    @staticmethod
    def _drain(source_queue):
        """Return everything currently in ``source_queue`` as a list."""
        drained = []
        while True:
            try:
                drained.append(source_queue.get_nowait())
            except queue.Empty:
                return drained

    def fetch_page(self, url, detail_id):
        """Download ``url`` and return the response body.

        Requests failing with "Connection aborted." are retried up to
        ``MAX_RETRIES`` times. Any other error, a non-200 status, or running
        out of retries marks ``detail_id`` as failed.

        Returns:
            The page text, or ``None`` when the page could not be fetched.
        """
        retries = 0
        while retries < self.MAX_RETRIES:
            try:
                response = requests.get(
                    url,
                    headers=self.headers,
                    cookies=self.cookies,
                    timeout=self.REQUEST_TIMEOUT,
                )
            except Exception as e:
                if "Connection aborted." in str(e):
                    retries += 1
                    print(f'{detail_id}请求错误：{e},重试次数: {retries}')
                    continue
                print(f'fetch_page error:{e}')
                self._mark_failed(detail_id)
                return None
            else:
                if response.status_code == 200:
                    return response.text
                print(f"Error Status code {response.status_code}")
                self._mark_failed(detail_id)
                return None
        print(f'{detail_id}超过重试次数')
        self._mark_failed(detail_id)
        return None

    def parse_html(self, html_content, detail_id):
        """Extract the bullet points, description and model from a page.

        Returns:
            A ``(salient_points, description, model)`` tuple on success, where
            ``description`` and ``model`` are ``None`` when the page carries
            no value for them. On any failure ``detail_id`` is marked as
            failed and an empty list is returned.
        """
        if not isinstance(html_content, str):
            print(f"Invalid html_content type: {type(html_content)}")
            self._mark_failed(detail_id)
            return []
        try:
            # Building the tree is inside the try block because the parser
            # itself can raise on unusable input.
            tree = etree.HTML(html_content)
            salient_points = '||'.join(
                tree.xpath('//ul[@class="sui-text-base sui-list-disc sui-list-inside"]//text()')
            )
            structured = tree.xpath('//script[@id="thd-helmet__script--productStructureData"]/text()')
            description = None
            model = None
            if structured:
                product = json.loads(structured[0])
                # A missing or empty field becomes None instead of failing
                # the whole record.
                description = product.get('description') or None
                model = product.get('model') or None
            return salient_points, description, model
        except Exception as e:
            print(f"Parsing error: {e}")
            self._mark_failed(detail_id)
            return []

    def save_data(self, detail_id, search_term, title, price, url, image, sku, salient_points, description, model):
        """Bundle one product's fields into a dict and queue it for saving."""
        item = {
            'search_term': search_term,
            'title': title,
            'price': price,
            'url': url,
            'image': image,
            'sku': sku,
            'salient_points': salient_points,
            'description': description,
            'model': model,
            'created_at': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }
        self.item_queue.put((item, detail_id))

    def process_keyword(self, search_term, detail_id, title, price, url, image, sku):
        """Fetch, parse and queue a single record.

        Returns early (the id has already been marked as failed by the
        failing step) when the page cannot be fetched or parsed.
        """
        html_content = self.fetch_page(url, detail_id)
        if html_content is None:
            print(f"Failed to fetch page for URL: {url}")
            return
        parsed = self.parse_html(html_content, detail_id)
        if not parsed:
            # parse_html returns an empty list on failure; unpacking it into
            # three names would raise ValueError.
            return
        salient_points, description, model = parsed
        self.save_data(detail_id, search_term, title, price, url, image, sku, salient_points, description, model)

    def update(self, success_id, failed_id):
        """Report the outcome of a batch through ``Con``.

        Args:
            success_id: list of id lists as drained from ``success_queue``;
                only reported when its first entry is non-empty.
            failed_id: list of id lists as drained from ``failed_queue``.

        The two status updates are attempted independently so that an error
        in one does not prevent the other.
        """
        if success_id and success_id[0]:
            try:
                print(f'成功的id:{success_id}')
                Con.hd_details_upstate_to_3(success_id)
                print("所有数据处理完成，状态已更新为3。")
            except Exception as e:
                print(f"更新状态时发生错误: {e}")

        if failed_id:
            try:
                print(f'失败的id:{failed_id}')
                Con.hd_details_upstate_to_4(failed_id)
                print("所有数据处理完成，状态已更新为4。")
            except Exception as e:
                print(f"更新状态时发生错误: {e}")

    def workers(self, task_queue):
        """Thread target: process tasks from ``task_queue`` until it is empty."""
        while True:
            try:
                # Non-blocking get avoids the race between empty() and get()
                # when several threads share the queue.
                task = task_queue.get_nowait()
            except queue.Empty:
                return
            search_term, detail_id, title, price, url, image, sku = task
            try:
                self.process_keyword(search_term, detail_id, title, price, url, image, sku)
            except Exception as e:
                # Keep the thread alive so the remaining tasks still run.
                print(f'process_keyword error for id {detail_id}: {e}')
                self._mark_failed(detail_id)
            finally:
                task_queue.task_done()

    def run(self, datas):
        """Process one batch of records and persist the results.

        Args:
            datas: iterable of strings, each holding the fields search_term,
                detail_id, title, price, url, image and sku joined by
                ``FIELD_SEPARATOR``. Records with too few fields are logged,
                placed on ``invalid_queue`` and skipped.
        """
        task_queue = queue.Queue()
        for data in datas:
            fields = data.split(self.FIELD_SEPARATOR)
            if len(fields) < self.FIELD_COUNT:
                print(f'Skipping malformed record (expected {self.FIELD_COUNT} fields): {data!r}')
                self.invalid_queue.put(data)
                continue
            search_term, detail_id, title, price, url, image, sku = fields[:self.FIELD_COUNT]
            print(f'detail_id:{detail_id},url:{url}')
            task_queue.put((search_term, detail_id, title, price, url, image, sku))

        threads = [
            threading.Thread(target=self.workers, args=(task_queue,))
            for _ in range(self.WORKER_THREADS)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        all_items = []
        all_idds = []
        for item, idd in self._drain(self.item_queue):
            all_items.append(item)
            all_idds.append(idd)

        # Nothing was parsed successfully: there is nothing to save.
        if all_items:
            unique_ids = list(set(all_idds))
            try:
                Con.save_hd_details(all_items)
            except Exception as e:
                print(f"保存时发生错误: {e}")
                self.failed_queue.put(unique_ids)
            else:
                self.success_queue.put(unique_ids)

        all_success_ids = self._drain(self.success_queue)
        all_failed_ids = self._drain(self.failed_queue)
        self.update(all_success_ids, all_failed_ids)



def worker(start_id, limit):
    """Load one slice of records through ``Con`` and crawl it.

    ``start_id`` and ``limit`` are passed straight to
    ``Con.get_homedepot_url``. A falsy result means there is nothing to do.

    Each record handed to ``HD_details.run`` is a single string whose fields
    are joined by '|-|-|-|-|-|', e.g. (abbreviated)
    ``'02 sensor downstream|-|-|-|-|-|1363|-|-|-|-|-|Oxygen Sensor - Downstream|-|-|-|-|-|46.08|-|-|-|-|-|<url>|-|-|-|-|-|<image>|-|-|-|-|-|308389157'``.
    """
    records = Con.get_homedepot_url(start_id, limit)
    if not records:
        return
    crawler = HD_details()
    crawler.run(records)


def main():
    """Crawl every batch, fanning each batch out over a process pool.

    For each batch number the id range is read from
    ``Con.homedepot_details_ranges``. The range is cut into slices of at most
    ``batch_size`` ids and each slice is handed to ``worker`` in a pool of
    ``num_processes`` processes. The loop stops at the first batch whose
    range is empty, and skips a batch whose range lookup raises.
    """
    num_processes = 5
    batch_size = 101

    def report_worker_error(exc):
        # apply_async drops a worker's exception unless the result is fetched
        # or an error_callback is supplied; surface it here instead.
        print(f'worker error: {exc}')

    print(f'开始爬取---{datetime.datetime.now()}')
    # NOTE(review): the upper bound 1952 is hard-coded - confirm it still
    # matches the number of batches.
    for i in range(1, 1952):
        start_time = datetime.datetime.now()
        try:
            minid, maxid = Con.homedepot_details_ranges(i)
        except Exception as e:
            print(f'第{i}批次获取范围时出错: {e}, 跳过此批次...')
            continue
        # An empty bound is treated as "no more data".
        if not minid or not maxid:
            print(f'批次{i}已全部获取')
            break

        print(f'第{i}批次', minid, maxid)

        first_id, last_id = int(minid), int(maxid)
        # (start id, row count) per slice; the last slice is clipped to last_id.
        data_range = [
            (start, min(start + batch_size - 1, last_id) - start + 1)
            for start in range(first_id, last_id + 1, batch_size)
        ]

        p = Pool(num_processes)
        try:
            for start_id, limit in data_range:
                p.apply_async(
                    worker,
                    args=(start_id, limit),
                    error_callback=report_worker_error,
                )
        finally:
            # Always release the pool's processes, even if submission fails.
            p.close()
            p.join()

        elapsed_time = datetime.datetime.now() - start_time
        print(f'第{i}批次爬取完毕，耗时：{elapsed_time}')

    print('全部爬取完毕')

if __name__ == "__main__":
    # Run the full crawl only when this file is executed as a script.
    main()
    # Disabled call that runs a single id range directly:
    # worker(1363, 2)














