「docker实战篇」python的docker爬虫技术-python脚本app抓取(13)
2019-05-19 22:25:54
李明
  • 访问次数: 377
  • 注册日期: 2018-07-09
  • 最后登录: 2019-10-30

原创文章,欢迎转载。转载请注明:转载自 IT人故事会,谢谢!
原文链接地址: 「docker实战篇」python的docker爬虫技术-python脚本app抓取(13)

上次已经分析出了app的具体请求链接,本次主要说说如何用python开发脚本,抓取APP里面的信息。源码: https://github.com/limingios/dockerpython.git

1240

分析app数据包

查看分析

1240

解析出来的header

1240

夜神配置

1240

python代码,爬取分类

1240
                      #!/usr/bin/env python 
                      # -*- coding: utf-8 -*- 
                      # @Time    : 2019/1/9 11:06 
                      # @Author  : lm 
                      # @Url     : idig8.com 
                      # @Site    :  
                      # @File    : spider_douguomeishi.py 
                      # @Software: PyCharm 
                      import requests 
                      #header内容比较多,因为各个厂家的思路不同, 
                      #fiddler爬取出来的字段比较多,有些内容应该是非必填的,只能在实际的时候尝试注释一些来试。 
                      
                      def handle_request(url, data, timeout=30):
                          """POST form fields to the Douguo API using the captured app headers.

                          The header values were copied from a packet capture of the Android
                          app; some of them are probably optional.

                          Args:
                              url: Full URL of the API endpoint.
                              data: Dict of form fields sent as the request body.
                              timeout: Seconds to wait for the server, so that a stalled
                                  connection cannot hang the script indefinitely.

                          Returns:
                              The response object returned by ``requests.post``.
                          """
                          header = {
                              "client": "4",
                              "version": "6916.2",
                              "device": "SM-G955N",
                              "sdk": "22,5.1.1",
                              "imei": "354730010002552",
                              "channel": "zhuzhan",
                              "mac": "00:FF:E2:A2:7B:58",
                              "resolution": "1440*900",
                              "dpi": "2.0",
                              "android-id": "bcdaf527105cc26f",
                              "pseudo-id": "354730010002552",
                              "brand": "samsung",
                              "scale": "2.0",
                              "timezone": "28800",
                              "language": "zh",
                              "cns": "3",
                              "carrier": "Android",
                              # "imsi": "310260000000000",
                              "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
                              "lon": "105.566938",
                              "lat": "29.99831",
                              "cid": "512000",
                              "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
                              "Accept-Encoding": "gzip, deflate",
                              "Connection": "Keep-Alive",
                              # "Cookie": "duid=58349118",
                              "Host": "api.douguo.net",
                              # "Content-Length": "65",
                          }
                          response = requests.post(url=url, headers=header, data=data, timeout=timeout)
                          return response
                      
                      def handle_index():
                          """Request the recipe category index and print the raw response body."""
                          url = "http://api.douguo.net/recipe/flatcatalogs"
                          # Form body as captured from the app:
                          # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
                          data = {
                              "client": "4",
                              "_session": "1547000257341354730010002552",
                              "v": "1503650468",
                              "_vs": "0",
                          }
                          response = handle_request(url, data)
                          print(response.text)
# Script entry point: request the category index and print the raw response.
handle_index()
1240

爬取详情:通过分类信息找到各分类里面的详情

1240
1240
                      #!/usr/bin/env python 
                      # -*- coding: utf-8 -*- 
                      # @Time    : 2019/1/9 11:06 
                      # @Author  : lm 
                      # @Url     : idig8.com 
                      # @Site    :  
                      # @File    : spider_douguomeishi.py 
                      # @Software: PyCharm 
                      import json 
                      import requests 
                      from multiprocessing import Queue

                      # Create the queue that holds the per-ingredient request payloads.
                      queue_list = Queue()
                      
                      def handle_request(url, data, timeout=30):
                          """POST form fields to the Douguo API using the captured app headers.

                          The header values were copied from a packet capture of the Android
                          app; some of them are probably optional.

                          Args:
                              url: Full URL of the API endpoint.
                              data: Dict of form fields sent as the request body.
                              timeout: Seconds to wait for the server, so that a stalled
                                  connection cannot hang the script indefinitely.

                          Returns:
                              The response object returned by ``requests.post``.
                          """
                          header = {
                              "client": "4",
                              "version": "6916.2",
                              "device": "SM-G955N",
                              "sdk": "22,5.1.1",
                              "imei": "354730010002552",
                              "channel": "zhuzhan",
                              "mac": "00:FF:E2:A2:7B:58",
                              "resolution": "1440*900",
                              "dpi": "2.0",
                              "android-id": "bcdaf527105cc26f",
                              "pseudo-id": "354730010002552",
                              "brand": "samsung",
                              "scale": "2.0",
                              "timezone": "28800",
                              "language": "zh",
                              "cns": "3",
                              "carrier": "Android",
                              # "imsi": "310260000000000",
                              "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
                              "lon": "105.566938",
                              "lat": "29.99831",
                              "cid": "512000",
                              "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
                              "Accept-Encoding": "gzip, deflate",
                              "Connection": "Keep-Alive",
                              # "Cookie": "duid=58349118",
                              "Host": "api.douguo.net",
                              # "Content-Length": "65",
                          }
                          response = requests.post(url=url, headers=header, data=data, timeout=timeout)
                          return response
                      
                      def handle_index():
                          """Fetch the category index and queue one search payload per ingredient.

                          Walks the three nested ``"cs"`` levels under ``result`` in the JSON
                          response and, for every innermost entry, puts a form payload carrying
                          that entry's ``name`` as ``keyword`` onto the module-level
                          ``queue_list``.
                          """
                          url = "http://api.douguo.net/recipe/flatcatalogs"
                          # Form body as captured from the app:
                          # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
                          data = {
                              "client": "4",
                              "_session": "1547000257341354730010002552",
                              "v": "1503650468",
                              "_vs": "0",
                          }
                          response = handle_request(url, data)
                          index_response_dic = json.loads(response.text)
                          for item_index in index_response_dic["result"]["cs"]:
                              for item_index_cs in item_index["cs"]:
                                  for item in item_index_cs["cs"]:
                                      data_2 = {
                                          "client": "4",
                                          "_session": "1547000257341354730010002552",
                                          "keyword": item["name"],
                                          # The key used to be "_vs " (trailing space), which
                                          # sent a differently named form field than the
                                          # "_vs" used by the other requests.
                                          "_vs": "400",
                                      }
                                      queue_list.put(data_2)
# Fill the queue from the category index, then report how many payloads were queued.
handle_index()
print(queue_list.qsize())
image.png
image.png

分类菜谱内部的详情信息

                      #!/usr/bin/env python 
                      # -*- coding: utf-8 -*- 
                      # @Time    : 2019/1/9 11:06 
                      # @Author  : lm 
                      # @Url     : idig8.com 
                      # @Site    :  
                      # @File    : spider_douguomeishi.py 
                      # @Software: PyCharm 
                      import json 
                      import requests 
                      from multiprocessing import Queue

                      # Create the queue that holds the per-ingredient request payloads.
                      queue_list = Queue()
                      
                      def handle_request(url, data, timeout=30):
                          """POST form fields to the Douguo API using the captured app headers.

                          The header values were copied from a packet capture of the Android
                          app; some of them are probably optional.

                          Args:
                              url: Full URL of the API endpoint.
                              data: Dict of form fields sent as the request body.
                              timeout: Seconds to wait for the server, so that a stalled
                                  connection cannot hang the script indefinitely.

                          Returns:
                              The response object returned by ``requests.post``.
                          """
                          header = {
                              "client": "4",
                              "version": "6916.2",
                              "device": "SM-G955N",
                              "sdk": "22,5.1.1",
                              "imei": "354730010002552",
                              "channel": "zhuzhan",
                              "mac": "00:FF:E2:A2:7B:58",
                              "resolution": "1440*900",
                              "dpi": "2.0",
                              "android-id": "bcdaf527105cc26f",
                              "pseudo-id": "354730010002552",
                              "brand": "samsung",
                              "scale": "2.0",
                              "timezone": "28800",
                              "language": "zh",
                              "cns": "3",
                              "carrier": "Android",
                              # "imsi": "310260000000000",
                              "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
                              "lon": "105.566938",
                              "lat": "29.99831",
                              "cid": "512000",
                              "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
                              "Accept-Encoding": "gzip, deflate",
                              "Connection": "Keep-Alive",
                              # "Cookie": "duid=58349118",
                              "Host": "api.douguo.net",
                              # "Content-Length": "65",
                          }
                          response = requests.post(url=url, headers=header, data=data, timeout=timeout)
                          return response
                      
                      def handle_index():
                          """Fetch the category index and queue one search payload per ingredient.

                          Walks the three nested ``"cs"`` levels under ``result`` in the JSON
                          response and, for every innermost entry, puts a form payload carrying
                          that entry's ``name`` as ``keyword`` onto the module-level
                          ``queue_list``.
                          """
                          url = "http://api.douguo.net/recipe/flatcatalogs"
                          # Form body as captured from the app:
                          # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
                          data = {
                              "client": "4",
                              "_session": "1547000257341354730010002552",
                              "v": "1503650468",
                              "_vs": "0",
                          }
                          response = handle_request(url, data)
                          index_response_dic = json.loads(response.text)
                          for item_index in index_response_dic["result"]["cs"]:
                              for item_index_cs in item_index["cs"]:
                                  for item in item_index_cs["cs"]:
                                      data_2 = {
                                          "client": "4",
                                          # "_session": "1547000257341354730010002552",
                                          "keyword": item["name"],
                                          # The key used to be "_vs " (trailing space), which
                                          # sent a differently named form field than the
                                          # "_vs" used by the other requests.
                                          "_vs": "400",
                                          "order": "0",
                                      }
                                      queue_list.put(data_2)
                      
                      def handle_caipu_list(data):
                          """Search recipes for one ingredient and print a summary of each.

                          Args:
                              data: Form payload built by ``handle_index``; its ``keyword``
                                  entry is the ingredient being searched.
                          """
                          print("当前的食材:", data["keyword"])
                          caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
                          caipu_response = handle_request(caipu_list_url, data)
                          caipu_response_dict = json.loads(caipu_response.text)
                          for caipu_item in caipu_response_dict["result"]["list"]:
                              # Only entries whose type is 13 are processed
                              # (presumably 13 marks a recipe entry -- confirm).
                              if caipu_item["type"] != 13:
                                  continue
                              recipe = caipu_item["r"]
                              caipu_info = {
                                  "shicai": data["keyword"],
                                  "user_name": recipe["an"],
                                  "shicai_id": recipe["id"],
                                  # Strip newlines and spaces from the story text.
                                  "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
                                  "caipu_name": recipe["n"],
                                  "zuoliao_list": recipe["major"],
                              }
                              print(caipu_info)


                      # Script entry: fill the queue, then process one queued ingredient.
                      handle_index()
                      handle_caipu_list(queue_list.get())
1240

菜品内部的详情信息

1240
1240
                      #!/usr/bin/env python 
                      # -*- coding: utf-8 -*- 
                      # @Time    : 2019/1/9 11:06 
                      # @Author  : lm 
                      # @Url     : idig8.com 
                      # @Site    :  
                      # @File    : spider_douguomeishi.py 
                      # @Software: PyCharm 
                      import json 
                      import requests 
                      from multiprocessing import Queue

                      # Create the queue that holds the per-ingredient request payloads.
                      queue_list = Queue()
                      
                      def handle_request(url, data, timeout=30):
                          """POST form fields to the Douguo API using the captured app headers.

                          The header values were copied from a packet capture of the Android
                          app; some of them are probably optional.

                          Args:
                              url: Full URL of the API endpoint.
                              data: Dict of form fields sent as the request body.
                              timeout: Seconds to wait for the server, so that a stalled
                                  connection cannot hang the script indefinitely.

                          Returns:
                              The response object returned by ``requests.post``.
                          """
                          header = {
                              "client": "4",
                              "version": "6916.2",
                              "device": "SM-G955N",
                              "sdk": "22,5.1.1",
                              "imei": "354730010002552",
                              "channel": "zhuzhan",
                              "mac": "00:FF:E2:A2:7B:58",
                              "resolution": "1440*900",
                              "dpi": "2.0",
                              "android-id": "bcdaf527105cc26f",
                              "pseudo-id": "354730010002552",
                              "brand": "samsung",
                              "scale": "2.0",
                              "timezone": "28800",
                              "language": "zh",
                              "cns": "3",
                              "carrier": "Android",
                              # "imsi": "310260000000000",
                              "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
                              "lon": "105.566938",
                              "lat": "29.99831",
                              "cid": "512000",
                              "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
                              "Accept-Encoding": "gzip, deflate",
                              "Connection": "Keep-Alive",
                              # "Cookie": "duid=58349118",
                              "Host": "api.douguo.net",
                              # "Content-Length": "65",
                          }
                          response = requests.post(url=url, headers=header, data=data, timeout=timeout)
                          return response
                      
                      def handle_index():
                          """Fetch the category index and queue one search payload per ingredient.

                          Walks the three nested ``"cs"`` levels under ``result`` in the JSON
                          response and, for every innermost entry, puts a form payload carrying
                          that entry's ``name`` as ``keyword`` onto the module-level
                          ``queue_list``.
                          """
                          url = "http://api.douguo.net/recipe/flatcatalogs"
                          # Form body as captured from the app:
                          # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
                          data = {
                              "client": "4",
                              "_session": "1547000257341354730010002552",
                              "v": "1503650468",
                              "_vs": "0",
                          }
                          response = handle_request(url, data)
                          index_response_dic = json.loads(response.text)
                          for item_index in index_response_dic["result"]["cs"]:
                              for item_index_cs in item_index["cs"]:
                                  for item in item_index_cs["cs"]:
                                      data_2 = {
                                          "client": "4",
                                          # "_session": "1547000257341354730010002552",
                                          "keyword": item["name"],
                                          # The key used to be "_vs " (trailing space), which
                                          # sent a differently named form field than the
                                          # "_vs" used by the other requests.
                                          "_vs": "400",
                                          "order": "0",
                                      }
                                      queue_list.put(data_2)
                      
                      def handle_caipu_list(data):
                          """Search recipes for one ingredient, fetch each detail page, print it.

                          For every type-13 entry in the search result this collects the
                          summary fields, requests the recipe detail endpoint, adds ``tips``
                          and ``cookstep`` from the detail response, and prints the record as
                          JSON.

                          Args:
                              data: Form payload built by ``handle_index``; its ``keyword``
                                  entry is the ingredient being searched.
                          """
                          print("当前的食材:", data["keyword"])
                          caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
                          caipu_response = handle_request(caipu_list_url, data)
                          caipu_response_dict = json.loads(caipu_response.text)
                          for caipu_item in caipu_response_dict["result"]["list"]:
                              # Only entries whose type is 13 are processed
                              # (presumably 13 marks a recipe entry -- confirm).
                              if caipu_item["type"] != 13:
                                  continue
                              recipe = caipu_item["r"]
                              caipu_info = {
                                  "shicai": data["keyword"],
                                  "user_name": recipe["an"],
                                  "shicai_id": recipe["id"],
                                  # Strip newlines and spaces from the story text.
                                  "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
                                  "caipu_name": recipe["n"],
                                  "zuoliao_list": recipe["major"],
                              }
                              recipe_id = str(caipu_info["shicai_id"])
                              detail_url = "http://api.douguo.net/recipe/detail/" + recipe_id
                              # json.dumps quotes and escapes the keyword, so a keyword that
                              # contains '"' or '\' no longer breaks the hand-built JSON; for
                              # ordinary keywords the resulting text is unchanged.
                              keyword_json = json.dumps(data["keyword"], ensure_ascii=False)
                              detail_data = {
                                  "client": "4",
                                  "_session": "1547000257341354730010002552",
                                  "author_id": "0",
                                  "_vs": "2803",
                                  "ext": '{"query": {"kw": ' + keyword_json
                                         + ', "src": "2803", "idx": "1", "type": "13", "id": '
                                         + recipe_id + '}}',
                              }
                              detail_response = handle_request(detail_url, detail_data)
                              detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
                              caipu_info["tips"] = detail_recipe["tips"]
                              caipu_info["cookstep"] = detail_recipe["cookstep"]
                              print(json.dumps(caipu_info))


                      # Script entry: fill the queue, then process one queued ingredient.
                      handle_index()
                      handle_caipu_list(queue_list.get())

将数据保存在mongodb中

  • 通过vagrant 安装虚拟机
vagrant up
  • 进入虚拟机

ip 192.168.66.100

1240
su -
# Password: vagrant
# NOTE(review): the comment and the next command were fused on one line;
# `docker` is presumably run on its own to confirm Docker is installed -- verify.
docker
1240
  • 拉取mongodb的镜像

https://hub.docker.com/r/bitnami/mongodb
默认端口:27017

docker pull bitnami/mongodb:latest
1240
  • 创建mongodb的容器
mkdir bitnami
cd bitnami
mkdir mongodb
# /path/to/mongodb-persistence is a placeholder for a host directory.
# NOTE(review): confirm the container-side mount path against the image's
# persistence documentation before relying on it for durable data.
docker run -d -v /path/to/mongodb-persistence:/root/bitnami -p 27017:27017 bitnami/mongodb:latest
# Stop the firewall (presumably so port 27017 is reachable from outside the VM).
systemctl stop firewalld

用第三方工具连接

1240

连接mongodb的工具

                      #!/usr/bin/env python 
                      # -*- coding: utf-8 -*- 
                      # @Time    : 2019/1/11 0:53 
                      # @Author  :  liming 
                      # @Site    :  
                      # @File    : handle_mongodb.py 
                      # @url    : idig8.com 
                      # @Software: PyCharm 
                      import pymongo 
                      from pymongo.collection import Collection


                      class Connect_mongo(object):
                          """Small helper that stores scraped items in MongoDB."""

                          def __init__(self):
                              """Open the client connection and select the target database."""
                              self.client = pymongo.MongoClient(host="192.168.66.100", port=27017)
                              self.db_data = self.client["dou_guo_mei_shi"]

                          def insert_item(self, item):
                              """Insert one document into the ``dou_guo_mei_shi_item`` collection.

                              Args:
                                  item: Dict describing a single scraped recipe.
                              """
                              db_collection = Collection(self.db_data, 'dou_guo_mei_shi_item')
                              # Collection.insert() is deprecated since PyMongo 3 and was
                              # removed in PyMongo 4; insert_one() is the replacement for a
                              # single document.
                              db_collection.insert_one(item)


                      # Shared instance exposed for other modules to import.
                      mongo_info = Connect_mongo()

python爬取的数据通过上面的mongo工具类保存到centos7上docker容器里的mongodb中

1240
1240
                      #!/usr/bin/env python 
                      # -*- coding: utf-8 -*- 
                      # @Time    : 2019/1/9 11:06 
                      # @Author  : lm 
                      # @Url     : idig8.com 
                      # @Site    :  
                      # @File    : spider_douguomeishi.py 
                      # @Software: PyCharm
import json
import requests
from multiprocessing import Queue
# NOTE(review): the helper listing's header names its file handle_mongodb.py,
# while this import expects a module called handle_mongo -- confirm that the
# file name and the import agree.
from handle_mongo import mongo_info

# Create the queue that holds the per-ingredient request payloads.
queue_list = Queue()
def handle_request(url, data, timeout=30):
    """POST form fields to the Douguo API using the captured app headers.

    The header values were copied from a packet capture of the Android
    app; some of them are probably optional.

    Args:
        url: Full URL of the API endpoint.
        data: Dict of form fields sent as the request body.
        timeout: Seconds to wait for the server, so that a stalled
            connection cannot hang the script indefinitely.

    Returns:
        The response object returned by ``requests.post``.
    """
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65",
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the category index and queue one search payload per ingredient.

    Walks the three nested ``"cs"`` levels under ``result`` in the JSON
    response and, for every innermost entry, puts a form payload carrying
    that entry's ``name`` as ``keyword`` onto the module-level
    ``queue_list``.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Form body as captured from the app:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    # The key used to be "_vs " (trailing space), which
                    # sent a differently named form field than the
                    # "_vs" used by the other requests.
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one ingredient and store each detailed record.

    For every type-13 entry in the search result this collects the
    summary fields, requests the recipe detail endpoint, adds ``tips``
    and ``cookstep`` from the detail response, and hands the record to
    ``mongo_info.insert_item``.

    Args:
        data: Form payload built by ``handle_index``; its ``keyword``
            entry is the ingredient being searched.
    """
    print("当前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only entries whose type is 13 are processed
        # (presumably 13 marks a recipe entry -- confirm).
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Strip newlines and spaces from the story text.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        recipe_id = str(caipu_info["shicai_id"])
        detail_url = "http://api.douguo.net/recipe/detail/" + recipe_id
        # json.dumps quotes and escapes the keyword, so a keyword that
        # contains '"' or '\' no longer breaks the hand-built JSON; for
        # ordinary keywords the resulting text is unchanged.
        keyword_json = json.dumps(data["keyword"], ensure_ascii=False)
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": '{"query": {"kw": ' + keyword_json
                   + ', "src": "2803", "idx": "1", "type": "13", "id": '
                   + recipe_id + '}}',
        }
        detail_response = handle_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        mongo_info.insert_item(caipu_info)


# Script entry: fill the queue, then process one queued ingredient.
handle_index()
handle_caipu_list(queue_list.get())
1240

通过python多线程-线程池抓取

  • python3通过 from concurrent.futures import ThreadPoolExecutor

引用线程池

1240
                      #!/usr/bin/env python 
                      
                 回贴