

from multiprocessing import Pool
from datetime import datetime, timedelta
from all_connect import ConnectSpider
# Module-level ConnectSpider instance, created at import time. The code below
# calls it to save scraped items (`save_project_items`), to fetch ID ranges
# (`homedepot_ranges`) and to update record states (`homedepot_upstate_to_3/4/5`).
Con = ConnectSpider()
from lxml import etree
import requests
import json
import datetime
from urllib.parse import quote
import threading
import queue
from queue import Queue



class HomeDepot:
    """Scrape Home Depot search-result pages for a batch of search terms.

    Results and bookkeeping travel through four queues owned by the instance:

    * ``item_queue``    -- ``(item_dict, search_id)`` tuples produced by ``save_data``.
    * ``success_queue`` -- lists of search ids whose items were saved.
    * ``failed_queue``  -- lists of search ids whose items could not be saved.
    * ``invalid_queue`` -- one-element lists ``[search_id]`` for searches that
      produced no usable result page.
    """

    # Seconds to wait on a request before giving up; without a timeout a
    # stalled connection would block its worker thread forever.
    REQUEST_TIMEOUT = 30
    # Only the first MAX_PAGES result pages are fetched; the `Nao` query
    # parameter advances by PAGE_SIZE per page.
    MAX_PAGES = 3
    PAGE_SIZE = 24
    # Number of scraping threads started by `run`.
    NUM_THREADS = 5
    # Separator between search term and id in the strings handed to `run`.
    PAIR_SEPARATOR = '|-|-|-|-|-|'
    # id of the <script> element carrying the structured search results.
    STRUCTURED_DATA_ID = 'thd-helmet__script--browseSearchStructuredData'
    STRUCTURED_DATA_XPATH = '//script[@id="thd-helmet__script--browseSearchStructuredData"]/text()'
    # Location of the "couldn't find" banner text on an empty result page.
    NOT_FOUND_XPATH = '//*[@id="root"]/div/div/div/div[2]/div/div/div/span[1]/text()'

    def __init__(self):
        self.headers = {
            'Referer': 'https://www.homedepot.com/s/02%20sensor%20downstream?NCNI-5',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }
        self.cookies = {
            'AMCV_F6421253512D2C100A490D45%40AdobeOrg': 'MCMID|16129746417315936432497630021679731371',
        }
        self.item_queue = Queue()
        self.success_queue = Queue()
        self.failed_queue = Queue()
        self.invalid_queue = Queue()

    def _mark_invalid(self, search_id):
        """Record *search_id* (wrapped in a one-element list) as an invalid search."""
        # The queue is unbounded, so a blocking put() cannot report "queue full".
        self.invalid_queue.put([search_id])
        print(f'将无效id:{search_id} 放入失效队列')

    def fetch_page(self, keyword, page, search_id):
        """Download one search-result page.

        Args:
            keyword: Raw search term; it is URL-quoted here.
            page: Result offset sent as the ``Nao`` query parameter.
            search_id: Id of the search term, used only for invalid-id bookkeeping.

        Returns:
            The response body as text on HTTP 200, otherwise ``None``.
        """
        encoded_keyword = quote(keyword)
        url = f'https://www.homedepot.com/s/{encoded_keyword}?NCNI-5&Nao={page}'
        try:
            response = requests.get(
                url,
                headers=self.headers,
                cookies=self.cookies,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                return response.text
            print(f"Error fetching page {page}: Status code {response.status_code}")
            return None
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            # NOTE(review): requests go to 'www.homedepot.com', so this check for
            # host='homedepot.com' probably never matches -- confirm the intent
            # before widening it (doing so would mark ids invalid on network errors).
            if "HTTPSConnectionPool(host='homedepot.com', port=443)" in str(e):
                self._mark_invalid(search_id)
            return None

    def parse_html(self, html_content, search_id, page):
        """Extract the product list from a search-result page.

        Args:
            html_content: Page source as text.
            search_id: Id of the search term, used for invalid-id bookkeeping.
            page: Result offset of this page; ``0`` means the first page.

        Returns:
            The ``itemOffered`` value of the first structured-data entry, or an
            empty list when the page holds no usable results.
        """
        try:
            tree = etree.HTML(html_content)

            # An explicit "couldn't find" banner means the search term has no
            # results; only the first page decides that the id is invalid.
            if tree is not None:
                banner = tree.xpath(self.NOT_FOUND_XPATH)
                if banner:
                    print(banner[0])
                    if "Hmm...we couldn't find" in banner[0]:
                        if page == 0:
                            self._mark_invalid(search_id)
                        return []

            # Without the structured-data script there is nothing to parse.
            if tree is None or self.STRUCTURED_DATA_ID not in html_content:
                self._mark_invalid(search_id)
                return []

            payload = json.loads(tree.xpath(self.STRUCTURED_DATA_XPATH)[0])
            if not (isinstance(payload, list) and payload):
                return []

            try:
                products = payload[0]['mainEntity']['offers']['itemOffered']
            except (KeyError, TypeError, IndexError):
                # Structured data has an unexpected shape: dump what the
                # alternative page layout exposes so it can be inspected.
                self._log_fallback_pods(tree)
                return []

            if products:
                return products
            if page == 0:
                self._mark_invalid(search_id)
            return []
        except Exception as e:
            print(f"Parsing error: {e}")
            return []

    def _log_fallback_pods(self, tree):
        """Print links, image sources and titles found in the alternative layout.

        Diagnostic only: nothing is returned or queued.
        """
        for section in tree.xpath('//section[@id="browse-search-pods1"]'):
            url = ['https://www.homedepot.com' + href
                   for href in section.xpath('./div/div/div/div/div/a/@href')]
            print(url)
            image = section.xpath('./div/div/div/div/div/a/div/img/@src')
            print(image)
            title = section.xpath('./div/div/div/div/div/a/div/img/@alt')
            print(title)

    @staticmethod
    def _parse_price(price):
        """Normalise a raw price value.

        Plain non-negative decimal strings become floats, numbers are returned
        unchanged and everything else becomes ``None``.
        """
        if isinstance(price, str):
            if not price.replace('.', '', 1).isdigit():
                return None
            try:
                return float(price)
            except ValueError:
                # isdigit() also accepts characters float() rejects (e.g. superscripts).
                return None
        if isinstance(price, (int, float)):
            return price
        return None

    def save_data(self, project_list, keyword, search_id):
        """Convert scraped products into item dicts and push them onto ``item_queue``.

        Args:
            project_list: Iterable of product dicts as returned by ``parse_html``.
            keyword: Search term the products were found with.
            search_id: Id of the search term; queued alongside every item.
        """
        for project in project_list:
            if not isinstance(project, dict):
                continue  # entries of an unexpected shape cannot be mapped to an item
            offers = project.get('offers')
            if not isinstance(offers, dict):
                # 'offers' may be missing, null or of another shape.
                offers = {}
            item = {
                'title': project.get('name', ''),
                'price': self._parse_price(offers.get('price')),
                'url': offers.get('url', ''),
                'image': project.get('image', ''),
                'sku': project.get('sku', ''),
                'search_term': keyword,
                'state': 1,
                'created_at': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            }
            self.item_queue.put((item, search_id))

    def process_keyword(self, search_term, search_id):
        """Scrape up to ``MAX_PAGES`` result pages for one search term.

        Stops early when a page cannot be fetched or yields no products.
        """
        for page in range(0, self.MAX_PAGES * self.PAGE_SIZE, self.PAGE_SIZE):
            print(f"爬取{search_term}第{page // self.PAGE_SIZE + 1}页...")
            html_content = self.fetch_page(search_term, page, search_id)
            if html_content is None:
                break

            project_list = self.parse_html(html_content, search_id, page)
            if not project_list:
                print(f"{search_term} Reached the last page.")
                break
            self.save_data(project_list, search_term, search_id)

    def update(self, success_id, failed_id, invalid_id):
        """Push the collected id lists to the status-update calls on ``Con``.

        Each group is updated independently so that a failure (or an empty
        group) cannot prevent the remaining groups from being updated.

        Args:
            success_id: List of id lists drained from ``success_queue``.
            failed_id: List of id lists drained from ``failed_queue``.
            invalid_id: List of one-element id lists drained from ``invalid_queue``.
        """
        # NOTE(review): success/failed ids are passed nested (list of lists) as
        # before, while invalid ids are flattened -- confirm what `Con` expects.
        if success_id and success_id[0]:
            try:
                print(f'成功的id:{success_id}')
                Con.homedepot_upstate_to_3(success_id)
                print("所有数据处理完成，状态已更新为3。")
            except Exception as e:
                print(f"更新状态时发生错误: {e}")

        if failed_id:
            try:
                print(f'失败的id:{failed_id}')
                Con.homedepot_upstate_to_4(failed_id)
                print("所有数据处理完成，状态已更新为4。")
            except Exception as e:
                print(f"更新状态时发生错误: {e}")

        if invalid_id:
            try:
                invalid_id = [item for sublist in invalid_id for item in sublist]
                print(f'无效的id:{invalid_id}')
                Con.homedepot_upstate_to_5(invalid_id)
                print("所有数据处理完成，状态已更新为5。")
            except Exception as e:
                print(f"更新状态时发生错误: {e}")

    def workers(self, task_queue):
        """Thread body: process ``(search_term, search_id)`` tasks until the queue is empty."""
        while True:
            try:
                # Non-blocking get: another thread may take the last task between
                # an emptiness check and the get, so Empty is the exit signal.
                search_term, search_id = task_queue.get_nowait()
            except queue.Empty:
                break
            try:
                self.process_keyword(search_term, search_id)
            except Exception as e:
                # One failing search term must not stop this thread from
                # handling the remaining tasks.
                print(f"处理搜索词失败: {search_term} (id:{search_id}): {e}")
            finally:
                task_queue.task_done()

    @staticmethod
    def _drain(source_queue):
        """Remove and return everything currently held by *source_queue*."""
        drained = []
        while True:
            try:
                drained.append(source_queue.get_nowait())
            except queue.Empty:
                return drained

    def run(self, search_term_id_pairs):
        """Scrape every search term, save the items and update the id states.

        Args:
            search_term_id_pairs: Iterable of strings of the form
                ``'<search term>|-|-|-|-|-|<id>'``. Entries without the
                separator are reported and skipped.
        """
        task_queue = queue.Queue()
        for search_term_id in search_term_id_pairs:
            parts = search_term_id.split(self.PAIR_SEPARATOR)
            if len(parts) < 2:
                print(f'跳过格式错误的搜索词: {search_term_id}')
                continue
            search_term, search_id = parts[0], parts[1]
            print(f'搜索词:{search_term},id:{search_id}')
            task_queue.put((search_term, search_id))

        threads = [
            threading.Thread(target=self.workers, args=(task_queue,))
            for _ in range(self.NUM_THREADS)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # All producers have finished, so the item queue can be drained safely.
        all_items = []
        all_idds = []
        for item, idd in self._drain(self.item_queue):
            all_items.append(item)
            all_idds.append(idd)

        unique_ids = list(set(all_idds))
        try:
            Con.save_project_items(all_items)
            self.success_queue.put(unique_ids)
        except Exception as e:
            print(f"保存时发生错误: {e}")
            self.failed_queue.put(unique_ids)

        self.update(
            self._drain(self.success_queue),
            self._drain(self.failed_queue),
            self._drain(self.invalid_queue),
        )



def worker(start_id, limit):
    """Scrape the search terms belonging to one ID range (pool task entry point).

    Args:
        start_id: First id of the range to process.
        limit: Number of rows to fetch starting at ``start_id``.
    """
    # The range arguments were previously ignored in favour of a hard-coded
    # debug pair, so every dispatched range scraped the same search term.
    # NOTE(review): confirm that `get_homedepot_search_terms` returns strings of
    # the form '<search term>|-|-|-|-|-|<id>', which is what `HomeDepot.run` parses.
    search_term_id_pairs = Con.get_homedepot_search_terms(start_id, limit)
    if search_term_id_pairs:
        HomeDepot().run(search_term_id_pairs)


def _split_id_range(minid, maxid, batch_size):
    """Split the inclusive id range [minid, maxid] into (start_id, row_count) chunks."""
    return [
        (start_id, min(batch_size, maxid - start_id + 1))
        for start_id in range(minid, maxid + 1, batch_size)
    ]


def main(max_batches=50, num_processes=5, batch_size=101):
    """Scrape batch after batch until no more id ranges are available.

    For every batch the id range is obtained from ``Con.homedepot_ranges``,
    split into chunks of ``batch_size`` ids and handed to ``worker`` in a
    process pool.

    Args:
        max_batches: Highest batch number to request (batches start at 1).
        num_processes: Size of the process pool used per batch.
        batch_size: Number of ids covered by one ``worker`` call.
    """
    start_time = datetime.datetime.now()
    print(f'开始爬取---{start_time}')
    for i in range(1, max_batches + 1):
        try:
            minid, maxid = Con.homedepot_ranges(i)
        except Exception as e:
            print(f'第{i}批次获取范围时出错: {e}, 跳过此批次...')
            continue  # skip this batch and try the next one

        # An empty bound is treated as "no more data to fetch".
        if not minid or not maxid:
            print(f'批次{i}已全部获取')
            break

        print(f'第{i}批次', minid, maxid)

        data_range = _split_id_range(int(minid), int(maxid), batch_size)

        # The context manager terminates the pool even if dispatching fails.
        with Pool(num_processes) as pool:
            pending = [
                (start_id, pool.apply_async(worker, args=(start_id, limit)))
                for start_id, limit in data_range
            ]
            pool.close()
            for start_id, result in pending:
                try:
                    # get() re-raises whatever the worker raised; without it the
                    # error would be dropped silently.
                    result.get()
                except Exception as e:
                    print(f'第{i}批次起始id {start_id} 的任务执行失败: {e}')
            pool.join()

        current_time = datetime.datetime.now()
        elapsed_time = current_time - start_time
        print(f'第{i}批次爬取完毕，耗时：{elapsed_time}')

    print('全部爬取完毕')

# if __name__ == "__main__":
#     # main()
#     worker(79304, 1)




