Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
spider
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
selection-new
spider
Commits
60a70d74
Commit
60a70d74
authored
Feb 14, 2026
by
Peng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
efc3b2e1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
16 deletions
+10
-16
requests_param.py
py_spider/utils/requests_param.py
+10
-16
No files found.
py_spider/utils/requests_param.py
View file @
60a70d74
...
@@ -8,7 +8,7 @@ import sys
...
@@ -8,7 +8,7 @@ import sys
import
time
import
time
import
uuid
import
uuid
from
urllib.parse
import
urlparse
from
urllib.parse
import
urlparse
from
threading
import
Lock
import
urllib3
import
urllib3
from
lxml
import
etree
from
lxml
import
etree
...
@@ -35,8 +35,6 @@ class Requests_param_val(BaseUtils):
...
@@ -35,8 +35,6 @@ class Requests_param_val(BaseUtils):
print
(
"站点名称:"
,
self
.
site_name
,
'抓取项目'
,
"代理ip:"
,
self
.
proxy_name
)
print
(
"站点名称:"
,
self
.
site_name
,
'抓取项目'
,
"代理ip:"
,
self
.
proxy_name
)
self
.
cookies_queue
=
Queue
()
# cookie队列
self
.
cookies_queue
=
Queue
()
# cookie队列
self
.
kafuka_producer_str
=
self
.
kafuka_connect
()
self
.
kafuka_producer_str
=
self
.
kafuka_connect
()
self
.
next_page_lock
=
Lock
()
self
.
headers_num_int_s
=
0
def
init_db_names
(
self
):
def
init_db_names
(
self
):
self
.
engine_pg
=
self
.
pg_connect
()
self
.
engine_pg
=
self
.
pg_connect
()
...
@@ -353,7 +351,6 @@ class Requests_param_val(BaseUtils):
...
@@ -353,7 +351,6 @@ class Requests_param_val(BaseUtils):
time
.
sleep
(
15
)
time
.
sleep
(
15
)
def
hex_md5
(
self
,
input_string
):
def
hex_md5
(
self
,
input_string
):
# 要加密的字符串
# 创建一个MD5哈希对象
# 创建一个MD5哈希对象
md5_hash
=
hashlib
.
md5
()
md5_hash
=
hashlib
.
md5
()
# 使用输入字符串的字节更新哈希对象
# 使用输入字符串的字节更新哈希对象
...
@@ -368,28 +365,26 @@ class Requests_param_val(BaseUtils):
...
@@ -368,28 +365,26 @@ class Requests_param_val(BaseUtils):
def
on_send_error
(
self
,
excp
):
def
on_send_error
(
self
,
excp
):
print
(
"消息发送失败"
,
excp
)
print
(
"消息发送失败"
,
excp
)
def
send_kafka
(
self
,
items
=
None
,
html_data
=
None
,
topic
=
None
,
num
=
3
):
def
send_kafka
(
self
,
items
=
None
,
html_data
=
None
,
topic
=
None
):
print
(
'向Kafka发送数据'
)
print
(
'向Kafka发送数据'
)
for
i
in
range
(
5
):
for
i
in
range
(
5
):
try
:
try
:
if
items
:
if
items
:
print
(
'232323232323'
)
del
items
[
'div_id_list'
]
del
items
[
'div_id_list'
]
future
=
self
.
kafuka_producer_str
.
send
(
topic
,
json
.
dumps
(
items
))
future
=
self
.
kafuka_producer_str
.
send
(
topic
,
json
.
dumps
(
items
))
future
.
add_callback
(
self
.
on_send_success
)
.
add_errback
(
self
.
on_send_error
)
future
.
add_callback
(
self
.
on_send_success
)
.
add_errback
(
self
.
on_send_error
)
future
.
get
(
30
)
if
html_data
:
if
html_data
:
future
=
self
.
kafuka_producer_str
.
send
(
topic
,
html_data
)
future
=
self
.
kafuka_producer_str
.
send
(
topic
,
html_data
)
future
.
add_callback
(
self
.
on_send_success
)
.
add_errback
(
self
.
on_send_error
)
future
.
add_callback
(
self
.
on_send_success
)
.
add_errback
(
self
.
on_send_error
)
future
.
get
(
30
)
print
(
'向Kafka发送数据 发送成功'
)
print
(
'向Kafka发送数据 发送成功'
)
with
self
.
next_page_lock
:
self
.
headers_num_int_s
+=
1
if
self
.
headers_num_int_s
%
10
==
0
:
self
.
kafuka_producer_str
.
flush
()
break
break
except
Exception
as
e
:
except
Exception
as
e
:
print
(
f
"kafka发送失败(第{i + 1}/5次)"
,
e
)
print
(
e
)
time
.
sleep
(
2
)
if
i
>=
1
:
if
i
>=
1
and
i
%
2
==
1
:
self
.
kafuka_producer_str
=
self
.
kafuka_connect
()
# 调用kafka
self
.
kafuka_producer_str
=
self
.
kafuka_connect
(
acks
=
True
)
try
:
\ No newline at end of file
self
.
kafuka_producer_str
.
flush
(
timeout
=
30
)
except
KafkaTimeoutError
as
e
:
print
(
"flush 超时,跳过这次等待:"
,
e
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment