原创文章,欢迎转载。转载请注明:转载自 IT人故事会,谢谢!
原文链接地址: 「docker实战篇」python的docker爬虫技术-python脚本app抓取(13)
上次已经分析出了app具体的请求链接,本次主要说说python的开发,抓取APP里面的信息。源码: https://github.com/limingios/dockerpython.git

分析app数据包
查看分析

解析出来的header

夜神配置

python代码,爬取分类

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import requests
#header内容比较多,因为各个厂家的思路不同,
#fiddler爬取出来的字段比较多,有些内容应该是非必填的,只能在实际的时候尝试注释一些来试。
def handle_request(url, data):
    """POST form data to a Douguo API endpoint, imitating the Android app.

    The header set was copied from a captured app request. Some of the
    fields are probably optional; the only way to find out is to comment
    them out one at a time and retry.

    :param url: full URL of the API endpoint.
    :param data: mapping of form fields sent as the request body.
    :return: the response object returned by ``requests.post``.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # Present in the capture but deliberately not sent:
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    """Fetch the recipe catalog index and print the raw response body."""
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body, for reference:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    print(response.text)


handle_index()

爬取详情,信息通过分类找到里面的详情


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue

# Queue of search payloads; handle_index() puts one entry per catalog item.
queue_list = Queue()


def handle_request(url, data):
    """POST form data to a Douguo API endpoint, imitating the Android app.

    The header set was copied from a captured app request. Some of the
    fields are probably optional; the only way to find out is to comment
    them out one at a time and retry.

    :param url: full URL of the API endpoint.
    :param data: mapping of form fields sent as the request body.
    :return: the response object returned by ``requests.post``.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # Present in the capture but deliberately not sent:
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    """Fetch the catalog index and queue one search payload per leaf entry.

    The response is walked three ``"cs"`` levels deep under ``"result"``;
    for every innermost item a search form is built from its ``"name"``
    and put on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body, for reference:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key was "_vs " (trailing space), which posts a
                    # differently named field than the "_vs" used by the
                    # other requests in this file.
                    "_vs": "400",
                }
                queue_list.put(data_2)


handle_index()
print(queue_list.qsize())

分类菜谱内部的详情信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue

# Queue of search payloads; handle_index() fills it and the script then
# hands one entry to handle_caipu_list().
queue_list = Queue()


def handle_request(url, data):
    """POST form data to a Douguo API endpoint, imitating the Android app.

    The header set was copied from a captured app request. Some of the
    fields are probably optional; the only way to find out is to comment
    them out one at a time and retry.

    :param url: full URL of the API endpoint.
    :param data: mapping of form fields sent as the request body.
    :return: the response object returned by ``requests.post``.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # Present in the capture but deliberately not sent:
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    """Fetch the catalog index and queue one search payload per leaf entry.

    The response is walked three ``"cs"`` levels deep under ``"result"``;
    for every innermost item a search form is built from its ``"name"``
    and put on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body, for reference:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key was "_vs " (trailing space), which posts a
                    # differently named field than the "_vs" used by the
                    # other requests in this file.
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)


def handle_caipu_list(data):
    """Search recipes for one queued keyword and print a summary of each hit.

    :param data: search form payload taken from ``queue_list``; its
        ``"keyword"`` entry is the ingredient being searched.
    """
    print("当前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only entries whose type is 13 are processed; the rest are skipped.
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Flatten the story text: drop newlines and spaces.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        print(caipu_info)


handle_index()
handle_caipu_list(queue_list.get())

菜品内部的详情信息


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue

# Queue of search payloads; handle_index() fills it and the script then
# hands one entry to handle_caipu_list().
queue_list = Queue()


def handle_request(url, data):
    """POST form data to a Douguo API endpoint, imitating the Android app.

    The header set was copied from a captured app request. Some of the
    fields are probably optional; the only way to find out is to comment
    them out one at a time and retry.

    :param url: full URL of the API endpoint.
    :param data: mapping of form fields sent as the request body.
    :return: the response object returned by ``requests.post``.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # Present in the capture but deliberately not sent:
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    """Fetch the catalog index and queue one search payload per leaf entry.

    The response is walked three ``"cs"`` levels deep under ``"result"``;
    for every innermost item a search form is built from its ``"name"``
    and put on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body, for reference:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key was "_vs " (trailing space), which posts a
                    # differently named field than the "_vs" used by the
                    # other requests in this file.
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)


def handle_caipu_list(data):
    """Search recipes for one keyword, fetch each recipe's detail, print it.

    For every search hit of type 13 a summary dict is built, then enriched
    with ``tips`` and ``cookstep`` from the recipe detail endpoint, and
    finally printed as JSON.

    :param data: search form payload taken from ``queue_list``; its
        ``"keyword"`` entry is the ingredient being searched.
    """
    print("当前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only entries whose type is 13 are processed; the rest are skipped.
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Flatten the story text: drop newlines and spaces.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }

        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # Encode the keyword with json.dumps so quotes or backslashes in it
        # cannot break the hand-assembled JSON; the rest of the string is
        # kept exactly as captured.
        ext = (
            '{"query": {"kw": '
            + json.dumps(data["keyword"], ensure_ascii=False)
            + ', "src": "2803", "idx": "1", "type": "13", "id": '
            + str(caipu_info["shicai_id"])
            + '}}'
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_reponse = handle_request(detail_url, detail_data)
        detail_reponse_dic = json.loads(detail_reponse.text)
        detail_recipe = detail_reponse_dic["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        print(json.dumps(caipu_info))


handle_index()
handle_caipu_list(queue_list.get())
将数据保存在mongodb中
- 通过vagrant 安装虚拟机
vagrant up
- 进入虚拟机
ip 192.168.66.100

su -
#密码:vagrant docker

- 拉取mongodb的镜像
https://hub.docker.com/r/bitnami/mongodb
默认端口:27017
docker pull bitnami/mongodb:latest

- 创建mongodb的容器
mkdir bitnami
cd bitnami
mkdir mongodb
docker run -d -v /path/to/mongodb-persistence:/root/bitnami -p 27017:27017 bitnami/mongodb:latest
#关闭防火墙
systemctl stop firewalld
用第三方工具连接

连接mongodb的工具
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/11 0:53
# @Author : liming
# @Site :
# @File : handle_mongodb.py
# @url : idig8.com
# @Software: PyCharm
import pymongo
from pymongo.collection import Collection


class Connect_mongo(object):
    """Small helper that stores scraped items in MongoDB.

    Connects to the server at 192.168.66.100:27017 and uses the
    ``dou_guo_mei_shi`` database.
    """

    def __init__(self):
        self.client = pymongo.MongoClient(host="192.168.66.100", port=27017)
        self.db_data = self.client["dou_guo_mei_shi"]

    def insert_item(self, item):
        """Insert one document into the ``dou_guo_mei_shi_item`` collection.

        :param item: dict to store.
        """
        db_collection = Collection(self.db_data, 'dou_guo_mei_shi_item')
        # Collection.insert() is deprecated since PyMongo 3.0 and removed in
        # 4.0; insert_one() is the supported call for a single document.
        db_collection.insert_one(item)


# Module-level instance exposed for other scripts to import.
mongo_info = Connect_mongo()
python爬取的数据通过mongo的工具保存到centos7的docker镜像中


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
# NOTE(review): the helper's own header comment names its file
# handle_mongodb.py, while this import expects handle_mongo.py - confirm
# which module name is the real one.
from handle_mongo import mongo_info

# Queue of search payloads; handle_index() fills it and the script then
# hands one entry to handle_caipu_list().
queue_list = Queue()


def handle_request(url, data):
    """POST form data to a Douguo API endpoint, imitating the Android app.

    The header set was copied from a captured app request. Some of the
    fields are probably optional; the only way to find out is to comment
    them out one at a time and retry.

    :param url: full URL of the API endpoint.
    :param data: mapping of form fields sent as the request body.
    :return: the response object returned by ``requests.post``.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # Present in the capture but deliberately not sent:
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    """Fetch the catalog index and queue one search payload per leaf entry.

    The response is walked three ``"cs"`` levels deep under ``"result"``;
    for every innermost item a search form is built from its ``"name"``
    and put on ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Captured form body, for reference:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key was "_vs " (trailing space), which posts a
                    # differently named field than the "_vs" used by the
                    # other requests in this file.
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)


def handle_caipu_list(data):
    """Search recipes for one keyword, fetch each recipe's detail, store it.

    For every search hit of type 13 a summary dict is built, then enriched
    with ``tips`` and ``cookstep`` from the recipe detail endpoint, and
    finally handed to ``mongo_info.insert_item``.

    :param data: search form payload taken from ``queue_list``; its
        ``"keyword"`` entry is the ingredient being searched.
    """
    print("当前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only entries whose type is 13 are processed; the rest are skipped.
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Flatten the story text: drop newlines and spaces.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }

        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # Encode the keyword with json.dumps so quotes or backslashes in it
        # cannot break the hand-assembled JSON; the rest of the string is
        # kept exactly as captured.
        ext = (
            '{"query": {"kw": '
            + json.dumps(data["keyword"], ensure_ascii=False)
            + ', "src": "2803", "idx": "1", "type": "13", "id": '
            + str(caipu_info["shicai_id"])
            + '}}'
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_reponse = handle_request(detail_url, detail_data)
        detail_reponse_dic = json.loads(detail_reponse.text)
        detail_recipe = detail_reponse_dic["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        mongo_info.insert_item(caipu_info)


handle_index()
handle_caipu_list(queue_list.get())

通过python多线程-线程池抓取
- python3 通过 `from concurrent.futures import ThreadPoolExecutor` 引用线程池

#!/usr/bin/env python
回帖列表
1/
0