urllib爬虫练习

练习一:爬取图片

import urllib.request

# Download the image and write the raw bytes to disk.
# response.read() returns bytes, so the file is opened in binary ('wb') mode.
# Fix: use a context manager so the HTTP connection is closed even on error
# (the original never closed the response).
with urllib.request.urlopen('http://p3.so.qhmsg.com/t0113deb127018e7658.jpg') as response:
    with open('get.jpg', 'wb') as f:
        f.write(response.read())

response.read()的类型为bytes
图片是以二进制数据构成的,因此打开一个名为‘get.jpg’的文件,以二进制的方式写入图片数据,即可将图片保存下来

练习二:实现有道云翻译

进入官网后,输入一些内容,然后点翻译,内容就被提交了

右键-审查元素-网络

选中POST,在‘消息头’中可以看到请求的网址;在‘参数’可以看到data的信息,其中‘i’为输入的需要翻译的语句,则只需修改‘i’的值即可实现输入翻译的功能

URL中translate后面的‘_o’要去掉

import urllib.request
import urllib.parse
import json

# Interactive translator using the Youdao web API.
# Sentinel value: entering '0' ends the loop.
message = '1'

while message != '0':
    message = input('请输入要翻译的话: ')
    if message == '0':
        break  # bug fix: the original still sent the quit sentinel '0' to the API

    # NOTE: the '_o' suffix must be removed from the URL captured in the
    # browser, otherwise the server rejects the request.
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

    # Form fields captured from the browser's network inspector; only 'i'
    # (the text to translate) changes between requests.
    data = {
        'action': 'FY_BY_CLICKBUTTION',
        'client': 'fanyideskweb',
        'doctype': 'json',
        'from': 'AUTO',
        'i': message,
        'keyfrom': 'fanyi.web',
        'salt': '1533128754526',
        'sign': '493bcf524b7ac8716e160251ec2cf9f1',
        'smartresult': 'dict',
        'to': 'AUTO',
        'typoResult': 'false',
        'version': '2.1',
    }

    # urlencode builds the form body; urlopen needs bytes for POST data.
    payload = urllib.parse.urlencode(data).encode('utf-8')

    # Fix: close the connection after each request via a context manager.
    with urllib.request.urlopen(url, payload) as response:
        result = json.loads(response.read().decode('utf-8'))

    print('翻译结果为: %s' % result['translateResult'][0][0]['tgt'])

json

在python中,字典的输出内容跟json格式内容一样,但是字典的格式是字典,json的格式是字符串,所以在传输的时候(特别是网页)要转换使用。

本质上来讲,字典是一种数据结构,json是一种格式;字典有很多内置函数,有多种调用方法,而json是数据打包的一种格式,并不像字典具备操作性,并且是格式就会有一些形式上的限制,比如json的格式要求必须且只能使用双引号作为key或者值的边界符号,不能使用单引号,而且“key”必须使用边界符(双引号),但字典就无所谓了。

练习三:实现有道云翻译-添加User-Agent

1.通过headers参数直接构造

import urllib.request
import urllib.parse
import json

# Interactive translator; a User-Agent header is passed via the Request's
# headers argument so the request looks like it comes from a real browser.
# Sentinel value: entering '0' ends the loop.
message = '1'

while message != '0':
    message = input('请输入要翻译的话: ')
    if message == '0':
        break  # bug fix: the original still sent the quit sentinel '0' to the API

    # NOTE: the '_o' suffix must be removed from the URL captured in the
    # browser, otherwise the server rejects the request.
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

    # Browser-like User-Agent so the server does not reject the script.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0',
    }

    # Form fields captured from the browser's network inspector; only 'i'
    # (the text to translate) changes between requests.
    data = {
        'action': 'FY_BY_CLICKBUTTION',
        'client': 'fanyideskweb',
        'doctype': 'json',
        'from': 'AUTO',
        'i': message,
        'keyfrom': 'fanyi.web',
        'salt': '1533128754526',
        'sign': '493bcf524b7ac8716e160251ec2cf9f1',
        'smartresult': 'dict',
        'to': 'AUTO',
        'typoResult': 'false',
        'version': '2.1',
    }

    payload = urllib.parse.urlencode(data).encode('utf-8')

    # Request(url, data, headers) attaches the POST body and the headers.
    req = urllib.request.Request(url, payload, head)

    # Fix: close the connection after each request via a context manager.
    with urllib.request.urlopen(req) as response:
        result = json.loads(response.read().decode('utf-8'))

    print('翻译结果为: %s' % result['translateResult'][0][0]['tgt'])

2.调用请求实例的add_header()方法添加;该方法有两个参数,第一个是key,第二个是value

import urllib.request
import urllib.parse
import json

# Interactive translator; the User-Agent is attached with req.add_header()
# (key as first argument, value as second) instead of a headers dict.
# Sentinel value: entering '0' ends the loop.
message = '1'

while message != '0':
    message = input('请输入要翻译的话: ')
    if message == '0':
        break  # bug fix: the original still sent the quit sentinel '0' to the API

    # NOTE: the '_o' suffix must be removed from the URL captured in the
    # browser, otherwise the server rejects the request.
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

    # Form fields captured from the browser's network inspector; only 'i'
    # (the text to translate) changes between requests.
    data = {
        'action': 'FY_BY_CLICKBUTTION',
        'client': 'fanyideskweb',
        'doctype': 'json',
        'from': 'AUTO',
        'i': message,
        'keyfrom': 'fanyi.web',
        'salt': '1533128754526',
        'sign': '493bcf524b7ac8716e160251ec2cf9f1',
        'smartresult': 'dict',
        'to': 'AUTO',
        'typoResult': 'false',
        'version': '2.1',
    }

    payload = urllib.parse.urlencode(data).encode('utf-8')

    req = urllib.request.Request(url, payload)
    # add_header(key, value): set a browser-like User-Agent on the request.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0')

    # Fix: close the connection after each request via a context manager.
    with urllib.request.urlopen(req) as response:
        result = json.loads(response.read().decode('utf-8'))

    print('翻译结果为: %s' % result['translateResult'][0][0]['tgt'])

练习四:爬取豆瓣一个页面的图片

import urllib.request
import os

# Create a folder for the downloaded photos and make it the working directory.
# Fix: exist_ok=True so a re-run does not crash with FileExistsError
# (the original os.mkdir raised on the second run).
os.makedirs('photos', exist_ok=True)
os.chdir('photos')

# Fetch the page source; Douban needs a browser-like User-Agent header.
url = 'https://movie.douban.com/chart'
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0')
with urllib.request.urlopen(req) as response:
    html = response.read().decode('utf-8')

# Collect image URLs by scanning for 'img src="..."' occurrences.
# html[a+9:b+4] skips the 9 characters of 'img src="' and keeps the '.jpg'
# extension; the '.jpg' is only searched within 200 characters of the tag
# so unrelated matches further down the page are not picked up.
pholist = []
a = html.find('img src=')
while a != -1:
    b = html.find('.jpg', a, a + 200)
    if b != -1:
        pholist.append(html[a + 9:b + 4])
    else:
        b = a + 9  # no .jpg nearby: resume the scan right after this match

    a = html.find('img src=', b)

# Download every image, numbering the files sequentially (1.jpg, 2.jpg, ...).
for num, photo_url in enumerate(pholist, start=1):
    # Fix: close each response (the original leaked every connection and
    # shadowed the stdlib name 're' with its local variable).
    with urllib.request.urlopen(photo_url) as photo_resp:
        with open('%d.jpg' % num, 'wb') as f:
            f.write(photo_resp.read())