    Scraping web data with Python and XPath and storing it in Django models

    While building a website for a friend I needed some product data. Since the site resells another company's products, I scraped the product data directly from that company's website.

    1. Designing the database

    from django.db import models
    from uuslug import slugify
    import uuid
    import os
    
    
    def products_directory_path(instance, filename):
      ext = filename.split('.')[-1]
      filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
      # return the whole path to the file
      return os.path.join('images', "products", instance.title, filename)
    
    
    def product_relatedimage_directory_path(instance, filename):
      ext = filename.split('.')[-1]
      filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
      # return the whole path to the file
      return os.path.join('images', "product_relatedimage", instance.product.title, filename)
    
    
    class ProductsCategory(models.Model):
      """产品分类"""
      name = models.CharField('产品分类名', max_length=80, unique=True)
      description = models.TextField('产品分类描述', blank=True, null=True)
      slug = models.SlugField('slug', max_length=80, blank=True, null=True)
      parent_category = models.ForeignKey('self', verbose_name="父级分类", blank=True, null=True, on_delete=models.CASCADE)
    
      def save(self, *args, **kwargs):
        if not self.id or not self.slug:
          self.slug = slugify(self.name)
        super().save(*args, **kwargs)
    
      def __str__(self):
        return self.name
    
      class Meta:
        ordering = ['name']
        verbose_name = "产品分类"
        verbose_name_plural = verbose_name
    
    
    class ProductsTag(models.Model):
      """产品标签"""
      name = models.CharField('产品标签名', max_length=30, unique=True)
      slug = models.SlugField('slug', max_length=40)
    
      def __str__(self):
        return self.name
    
      def save(self, *args, **kwargs):
        if not self.id or not self.slug:
          self.slug = slugify(self.name)
        super().save(*args, **kwargs)
    
      class Meta:
        ordering = ['name']
        verbose_name = "产品标签"
        verbose_name_plural = verbose_name
    
    
    class Product(models.Model):
      title = models.CharField('标题', max_length=255, unique=True)
      slug = models.SlugField('slug', max_length=255, blank=True, null=True)
      jscs = models.TextField('技术参数', blank=True, null=True)
      image = models.ImageField(upload_to=products_directory_path, verbose_name="产品图片")
      views = models.PositiveIntegerField('浏览量', default=0)
      category = models.ForeignKey('ProductsCategory', verbose_name='分类', on_delete=models.CASCADE, blank=True, null=True)
      tags = models.ManyToManyField('ProductsTag', verbose_name='标签集合', blank=True)
    
      def save(self, *args, **kwargs):
        if not self.id or not self.slug:
          self.slug = slugify(self.title)
        super().save(*args, **kwargs)
    
      def update_views(self):
        self.views += 1
        self.save(update_fields=['views'])
    
      def get_pre(self):
        return Product.objects.filter(id__lt=self.id).order_by('-id').first()
    
      def get_next(self):
        return Product.objects.filter(id__gt=self.id).order_by('id').first()
    
      def __str__(self):
        return self.title
    
      class Meta:
        verbose_name = "产品"
        verbose_name_plural = verbose_name
    
    
    class ProductAdvantage(models.Model):
      content = models.TextField('产品优势', blank=True, null=True)
      product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)
    
      def __str__(self):
        return self.content
    
      class Meta:
        verbose_name = "产品优势"
        verbose_name_plural = verbose_name
    
    
    class ProductBody(models.Model):
      body = models.CharField('产品内容', max_length=256, blank=True, null=True)
      product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)
    
      def __str__(self):
        return self.product.title
    
      class Meta:
        verbose_name = "产品内容"
        verbose_name_plural = verbose_name
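
    These models live in a products app (the import in section 3 below uses products.models). Before the scraper can write to them, the app's migrations have to be created and applied with python manage.py makemigrations products and python manage.py migrate.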

    2. Writing the scraper

    2.1 A function to fetch the page source

    def get_one_page(url):
      try:
        headers = {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
        res = requests.get(url=url, headers=headers)
        res.encoding = 'utf-8'
        if res.status_code == 200:
          return res.text
        else:
          return None
      except Exception:
        return None

    2.2 Getting all product category page links from the base page

    if __name__ == '__main__':
      content = get_one_page(url)
      tree = etree.HTML(content)
      # product category URLs
      catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
      # turn each relative catgory_urls entry into an absolute URL
      for url in catgory_urls:
        url = 'http://www.kexinjianji.com' + url
        print(url)

    2.3 Getting all product links from each category page

    if __name__ == '__main__':
      content = get_one_page(url)
      tree = etree.HTML(content)
      # product category name
      catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
      print("产品分类:" + catgory[0])
      # product URLs under this category
      urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
      # turn each relative URL into an absolute one
      for url in urls:
        url = 'http://www.kexinjianji.com' + url
        print(url)
      print("=====================================================")

    Combining the two steps prints every product link:

    if __name__ == '__main__':
      content = get_one_page(url)
      tree = etree.HTML(content)
      # product category URLs
      catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
      # turn each relative catgory_urls entry into an absolute URL
      for url in catgory_urls:
        url = 'http://www.kexinjianji.com' + url
        content = get_one_page(url)
        tree = etree.HTML(content)
        # product category name
        catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        print("产品分类:" + catgory[0])
        # product URLs under this category
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        # turn each relative URL into an absolute one
        for url in urls:
          url = 'http://www.kexinjianji.com' + url
          print(url)
        print("=====================================================")

    2.4 Parsing each product page with XPath

    if __name__ == '__main__':
      content = get_one_page(url)
      tree = etree.HTML(content)
      # product name
      title = tree.xpath('//*[@id="wrap"]//h1/text()')
      images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
      # product image (the scraped src is relative, so prepend the site root)
      images_url = 'http://www.kexinjianji.com' + images[0]
      # performance features
      xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
      # technical parameters, kept as the raw <table> HTML
      jscs = tree.xpath('//table')[0]
      jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
      # product description paragraphs
      cpnr = tree.xpath('//div[@class="describe"]/p')
      print('产品名称:' + title[0])
      print('产品图片:' + images_url)
      for td in xntd:
        print('性能特点:' + td)
      print('技术参数:' + jscs_str)
      for cp in cpnr:
        # string(.) returns all the text inside the current element
        cp = cp.xpath('string(.)')
        print('产品内容:' + cp)
      print('============================================')

    Putting the three steps together retrieves all of the product information:

    if __name__ == '__main__':
      content = get_one_page(url)
      tree = etree.HTML(content)
      # product category URLs
      catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
      # turn each relative catgory_urls entry into an absolute URL
      for url in catgory_urls:
        url = 'http://www.kexinjianji.com' + url
        content = get_one_page(url)
        tree = etree.HTML(content)
        # product category name
        catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # product URLs under this category
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        # turn each relative URL into an absolute one
        for url in urls:
          url = 'http://www.kexinjianji.com' + url
          content = get_one_page(url)
          try:
            tree = etree.HTML(content)
            # product name
            title = tree.xpath('//*[@id="wrap"]//h1/text()')
            images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
            # product image (relative src, prepend the site root)
            images_url = 'http://www.kexinjianji.com' + images[0]
            # performance features
            xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
            # technical parameters, kept as the raw <table> HTML
            jscs = tree.xpath('//table')[0]
            jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
            # product description paragraphs
            cpnr = tree.xpath('//div[@class="describe"]/p')
            cpnr = tree.xpath('//div[@class="describe"]/p')
            print("产品分类:" + catgory[0])
            print('产品链接:' + url)
            print('产品名称:' + title[0])
            print('产品图片:' + images_url)
            for td in xntd:
              print('性能特点:' + td.strip())
            # print('技术参数:' + jscs_str)
            for cp in cpnr:
              # string(.) returns all the text inside the current element
              cp = cp.xpath('string(.)')
              print('产品内容:' + cp)
            print('============================================')
          except Exception as e:
            print(e)
            print('出错url:' + url)
            pass

    3. Storing the data in the Django models

    import requests
    from lxml import etree
    import os
    import django
    import uuid
    from django.core.files.base import ContentFile
    
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
    django.setup()
    
    from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage
    
    url = 'http://www.kexinjianji.com/product/hzshntjbz_1/'
    
    
    def get_one_page(url):
      try:
        headers = {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
        res = requests.get(url=url, headers=headers, timeout=10)
        res.encoding = 'utf-8'
        if res.status_code == 200:
          return res.text
        else:
          return None
      except Exception:
        print('请求出错url:' + url)
        return None
    
    
    if __name__ == '__main__':
      content = get_one_page(url)
      tree = etree.HTML(content)
      # product category URLs
      catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
      # turn each relative catgory_urls entry into an absolute URL
      for url in catgory_urls:
        url = 'http://www.kexinjianji.com' + url
        content = get_one_page(url)
        tree = etree.HTML(content)
        # product category name
        p_catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # product URLs under this category
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        # turn each relative URL into an absolute one
        for url in urls:
          url = 'http://www.kexinjianji.com' + url
          content = get_one_page(url)
          try:
            tree = etree.HTML(content)
            # product name
            title = tree.xpath('//*[@id="wrap"]//h1/text()')
            images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
            # product image (relative src, prepend the site root)
            images_url = 'http://www.kexinjianji.com' + images[0]
            # performance features
            xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
            # technical parameters, kept as the raw <table> HTML
            jscs = tree.xpath('//table')[0]
            jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
            # product description paragraphs
            cpnr = tree.xpath('//div[@class="describe"]/p')
            # check whether this category already exists; create it if not
            catgory = p_catgory[0]
            products_catgory = ProductsCategory.objects.filter(name=catgory).exists()
            if products_catgory:
              products_catgory = ProductsCategory.objects.get(name=catgory)
            else:
              products_catgory = ProductsCategory(name=catgory)
              products_catgory.save()
            print(products_catgory)
    
            # download and save the product image
            image_content = requests.get(url=images_url)
            ext = images_url.split('.')[-1]  # image file extension
            filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)  # random file name
            upload_image_file = ContentFile(image_content.content, name=filename)  # wrap the bytes so Django can store the file
            product = Product(title=title[0], jscs=jscs_str, image=upload_image_file, category=products_catgory)
            product.save()
            for td in xntd:
              product_advantage = ProductAdvantage()
              product_advantage.content = td
              product_advantage.product = product
              product_advantage.save()
            for cp in cpnr:
              cp = cp.xpath('string(.)')
              product_body = ProductBody()
              product_body.body = cp
              product_body.product = product
              product_body.save()
          except Exception as e:
            print(e)
            print('出错url:' + url)

    Finally, I handled the failed URLs by hand (those pages have no technical-parameter table to extract, because the parameters are provided as an image).
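
    To make that manual pass easier, the failed URLs can be collected during the run instead of only being printed. A minimal sketch, assuming it is merged into the except branch of the script above (the file name failed_urls.txt is just an example):

    failed_urls = []  # product pages that raised an exception while being parsed

    # inside the except branch, record the URL in addition to printing it:
    #     failed_urls.append(url)

    # after both loops finish, write the list out for a manual second pass
    with open('failed_urls.txt', 'w', encoding='utf-8') as f:
      f.write('\n'.join(failed_urls))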

    4. Summary

    1. When extracting tag content with XPath, the p tags have span tags nested inside them. The page source looks like this:

    div class="describe" style="position: relative;"> 
       p>span>板  宽:/span>1500mm/p> 
       p>span>板  厚:/span>4.5 mm/p> 
       p>span>出料口:/span>6口/p> 
       p>span>重  量:/span>6000 kg/p>
    /div>

    When extracting the p tag content with XPath, the output I wanted was:
    板 宽:1500mm
    板 厚:4.5 mm
    出料口:6口
    重 量:6000 kg
    The following XPath expression only returns the labels and values as separate items, which is not what I wanted:

    //div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()

    After a Baidu search, the solution I found was to use xpath('string(.)'):
    1. First get all the p tags:

    cpnr = tree.xpath('//div[@class="describe"]/p')

    2. Then use **string(.)** to get all of the text inside each tag:

    cp = cp.xpath('string(.)')

    Looping over all the p tags produces the desired output.
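
    A minimal, self-contained sketch of the difference, using a shortened copy of the HTML shown above (the variable name html exists only for this demo):

    from lxml import etree

    html = '''<div class="describe">
      <p><span>板  宽:</span>1500mm</p>
      <p><span>板  厚:</span>4.5 mm</p>
    </div>'''
    tree = etree.HTML(html)

    # the union expression returns the labels and values as separate list items
    print(tree.xpath('//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()'))

    # string(.) on each p element keeps each label together with its value
    for p in tree.xpath('//div[@class="describe"]/p'):
      print(p.xpath('string(.)'))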

    This concludes the walkthrough of scraping web data with Python and XPath and storing it in Django models.
