# tf_dome1.py


import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import psycopg2
import pandas as pd


"""
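A simple text-classification model that predicts whether a product is Christmas-related.
Text:         ASIN, marketplace site, title, description, selling points
Sub-text (buyer reviews):
Images/video: ASIN, marketplace site, main image, secondary images, A+ content, video

Big-data model training workflow
1. Process large volumes of data with computing frameworks such as Hadoop and Spark to process, analyse and mine the data. This involves storage, cleaning, analysis, and so on.
2. The example below focuses on applying machine learning and deep learning, and on building a text-classification model with TensorFlow and Keras.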
"""

def is_christmas_product(row):
    """Return True when any text field of *row* mentions a Christmas keyword.

    The ``title``, ``describe`` and ``product_description`` entries of *row*
    are lower-cased and searched for each keyword as a plain substring.
    Missing values (``None`` / NaN, as reported by ``pd.notna``) are treated
    as empty text.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``title``, ``describe`` and ``product_description``.

    Returns:
        bool: True if at least one keyword occurs in at least one field.
    """
    christmas_terms = (
        'christmas', 'christmas tree', 'christmas gifts',
        'christmas decorations', 'christmas celebration',
        'santa claus', 'winter holiday', 'christmas shopping',
        'holiday party',
    )

    # Normalise all three fields up front so every keyword is checked
    # against the same lower-cased text.
    searchable = []
    for column in ('title', 'describe', 'product_description'):
        if pd.notna(row[column]):
            searchable.append(str(row[column]).lower())
        else:
            searchable.append('')

    for term in christmas_terms:
        for text in searchable:
            if term in text:
                return True
    return False

def has_christmas_decorations(row):
    """Return True when any text field of *row* mentions a decoration keyword.

    Each keyword is looked up, case-insensitively and as a plain substring,
    in ``title``, then ``describe``, then ``product_description``.  Values
    are converted with ``str()`` before matching, so non-string values are
    searched by their string representation.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``title``, ``describe`` and ``product_description``.

    Returns:
        bool: True as soon as one keyword is found in one field.
    """
    decoration_terms = (
        'decorations', 'ornaments', 'festive decor',
        'holiday decor', 'christmas adornments',
    )
    columns = ('title', 'describe', 'product_description')

    # Fields are read lazily, keyword by keyword, and the search stops at
    # the first hit.
    for term in decoration_terms:
        for column in columns:
            if term in str(row[column]).lower():
                return True
    return False

def is_condition_met(row):
    """Return True when the literal text 'keyword' appears in every field.

    The check is a case-sensitive substring test on the ``str()`` form of
    ``title``, ``product_description`` and ``describe`` (in that order); it
    stops at the first field that does not contain the text.

    NOTE(review): the search term is the literal word 'keyword', which looks
    like a placeholder - confirm the intended term.

    Args:
        row: Mapping-like object with the keys ``title``,
            ``product_description`` and ``describe``.

    Returns:
        bool: True only if all three fields contain 'keyword'.
    """
    required_columns = ('title', 'product_description', 'describe')
    return all('keyword' in str(row[column]) for column in required_columns)


# Connect to the PostgreSQL database.
# SECURITY(review): host and credentials are hard-coded in source control.
# Move them to environment variables or a secrets store and rotate this
# password.
connection = psycopg2.connect(
    host="192.168.10.223",
    port="5432",
    database="selection",
    user="postgres",
    password="fazAqRRVV9vDmwDNRNb593ht5TxYVrfTyHJSJ3BS"
)

# Load up to 1000 crawled product rows for 2023-12 that have a non-empty
# product description.
query = "select  asin,title,product_description,describe  from us_asin_detail_month_2023_12  where date_info ='2023-12'  and spider_int = 1  and product_description !=''   limit 1000"

try:
    df = pd.read_sql(query, connection)
finally:
    # Always release the connection, even when the query fails.
    connection.close()

# Drop rows without any usable text: keep a row when at least one of
# title / product_description / describe is present and non-empty.
has_text = (
    (df['title'].notna() & (df['title'] != ''))
    | (df['product_description'].notna() & (df['product_description'] != ''))
    | (df['describe'].notna() & (df['describe'] != ''))
)
# .copy() detaches the filtered frame so the column assignments below write
# to a real DataFrame rather than a slice (avoids SettingWithCopyWarning).
df = df[has_text].copy()

if df.empty:
    raise ValueError('No rows with usable text were loaded; nothing to train on.')

# Flag rows that satisfy the generic condition check.
df['meets_condition'] = df.apply(is_condition_met, axis=1)

# Derive the label and the extra feature column once, BEFORE splitting, so
# that the train and test partitions both carry identical columns.
df['is_christmas'] = df.apply(is_christmas_product, axis=1)
df['has_christmas_decorations'] = df.apply(has_christmas_decorations, axis=1)

# Split into train and held-out test partitions (80 % / 20 %).
# The held-out partition is kept as the test set.  Re-sampling the training
# frame as "test" data would make validation metrics measure memorisation
# of the training set rather than generalisation.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.copy()
test_df = test_df.copy()

def _combine_text_fields(frame):
    """Join describe, title and product_description into one text per row.

    Missing values are replaced with an empty string *per column* before
    concatenation.  Concatenating first would turn the whole row into NaN as
    soon as one field is missing, discarding the text of the other fields.

    Args:
        frame: DataFrame with ``describe``, ``title`` and
            ``product_description`` columns.

    Returns:
        pandas.Series of str, aligned with ``frame.index``.
    """
    parts = frame[['describe', 'title', 'product_description']].fillna('').astype(str)
    return parts['describe'] + ' ' + parts['title'] + ' ' + parts['product_description']


# Merge the text fields; train and test go through the same preprocessing so
# the tokenizer sees consistently prepared text.
train_texts = _combine_text_fields(train_df)
test_texts = _combine_text_fields(test_df)

# Build the vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

# Convert each text into a sequence of integer word indices.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

# Pad / truncate at the end so every sequence has exactly max_length entries.
max_length = 150  # adjust to the data as needed
padding_options = {'maxlen': max_length, 'padding': 'post', 'truncating': 'post'}
train_padded = pad_sequences(train_sequences, **padding_options)
test_padded = pad_sequences(test_sequences, **padding_options)


# Build the model: a simple deep-learning text classifier made with
# TensorFlow and Keras, assembled one layer at a time.
#
# Embedding: maps high-dimensional data (text, images, audio) into a
#   lower-dimensional space by representing each input as a vector of real
#   numbers, i.e. a position in a continuous space.  Embedding vectors
#   capture structural information and help an algorithm relate different
#   pieces of data, notably in NLP and computer vision.
# GlobalAveragePooling1D: averages the outputs of all time steps into one
#   fixed-length vector, so texts of different lengths share one
#   representation.
# Dense: a fully connected layer.
model = tf.keras.Sequential()

# input_dim:    number of words in the vocabulary, plus 1 because the
#               vocabulary indices start at 1 rather than 0.
# output_dim:   size of the embedding vector each word is mapped to.
# input_length: length of the (padded) input sequences.
model.add(tf.keras.layers.Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=100,
    input_length=max_length,
))

model.add(tf.keras.layers.GlobalAveragePooling1D())

# Hidden fully connected layer: 128 units with tanh activation.
# Alternatives noted by the author: relu activation, or relu combined with
# an L2 kernel regulariser of 0.01.
model.add(tf.keras.layers.Dense(128, activation='tanh'))

# Output layer: a single unit with sigmoid activation.  This is a binary
# classification problem, and sigmoid maps the output into the 0..1 range so
# it can be read as the probability of the positive class.  Keeping the
# hidden and output layers separate makes it easy to insert further hidden
# layers or change unit counts between them.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Adam optimizer with an explicit learning rate, so it can be tuned here.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Compile the model.
# optimizer: the configured Adam instance above.  Passing the string 'adam'
#            instead would build a fresh default optimizer and silently
#            ignore the learning rate set here.
# loss:      measures the gap between predictions and true labels during
#            training; 'binary_crossentropy' is the usual choice for binary
#            classification ('categorical_crossentropy' for multi-class).
# metrics:   values monitored during training and evaluation; accuracy here.
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train for five epochs, reporting loss/accuracy on the test data after each
# epoch.
model.fit(
    x=train_padded,
    y=train_df['is_christmas'],
    epochs=5,
    validation_data=(test_padded, test_df['is_christmas']),
)

# Persist the model architecture together with its weights.
model.save('text_classification_model.h5')

# Score every test sample with the trained model.
predictions = model.predict(test_padded)

# A sample counts as Christmas-related when its score exceeds this value.
threshold = 0.5

# One record (ASIN plus field details) per sample the model flags.
matching_asins_info = []

# Walk the predictions; position i lines up with row i of test_df.
for i, probabilities in enumerate(predictions):
    asin = test_df["asin"].iloc[i]
    is_christmas_label = test_df["is_christmas"].iloc[i]

    # Skip samples the model does not consider Christmas-related.
    if not probabilities[0] > threshold:
        continue

    print('asin:',asin)

    matching_fields = {
        'Title': test_df['title'].iloc[i],
        'Description': test_df['describe'].iloc[i],
        'ProductDescription': test_df['product_description'].iloc[i]
    }
    record = {
        'ASIN': asin,
        'IsChristmasLabel': is_christmas_label,
        'PredictedLabel': 1,  # 1 means "predicted as related"
        'Probabilities': probabilities,
        'MatchingFields': matching_fields
    }
    matching_asins_info.append(record)


# Write one plain-text record per flagged ASIN.
output_file_path = 'E:/BaiduNetdiskDownload/选品大数据/推荐系统/matching_asins_info.txt'

# (label written to the report, key inside info['MatchingFields']), in
# priority order.  The label and the dictionary key are kept separate:
# 'Product Description' (with a space) is only a display label, and using it
# as the lookup key raises KeyError because the key is 'ProductDescription'.
field_priority = (
    ('Description', 'Description'),
    ('Product Description', 'ProductDescription'),
)

with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for info in matching_asins_info:
        fields = info['MatchingFields']

        # Report the first non-empty field in priority order; fall back to
        # the title when neither description field has content.
        matched_field, matched_key = 'Title', 'Title'
        for label, key in field_priority:
            if fields[key]:
                matched_field, matched_key = label, key
                break

        output_file.write(f"ASIN: {info['ASIN']}\n")
        output_file.write(f"Is Christmas Label: {info['IsChristmasLabel']}\n")
        output_file.write(f"Matched Field: {matched_field}\n")
        output_file.write(f"Matched Content: {fields[matched_key]}\n")
        output_file.write('-' * 50 + '\n')