爬取bilibili的番剧

原创
admin

本次测试为java环境

1.通过浏览器抓包得到api

https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1&season_type=1&pagesize=20&type=1
其中page字段为分页


再来看返回的为json数据

爬取bilibili的番剧

显然我们要的数据都在data中

大概步骤就出来了:

  • 对api进行请求取出data
  • 通过has_next是否为1来判断是否还有下一页(不为1时说明已全部获取完)
  • 获取JSONArray 类型的 list 字段中的番剧数据
  • 遍历list
  • 存入每一项进数据库
  • page+1进行下一次请求

1.请求api

我用的okhttp框架获取response

  // One client for every request: OkHttp recommends sharing a single instance so that its
  // connection pool and worker threads are reused instead of being recreated per page.
  private static final OkHttpClient HTTP_CLIENT = new OkHttpClient();

  /**
   * Requests page {@code num} of the bilibili anime index API.
   *
   * <p>The call is synchronous and blocks the calling thread until the response headers arrive.
   *
   * @return the HTTP response, whose body the caller must read or close, or {@code null} when
   *     the request fails with an {@link IOException}
   */
  Response getResponse() {
    String url = "https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=" + num + "&season_type=1&pagesize=20&type=1";
    // Build a GET request for the index API that carries the same headers a browser sends.
    // NOTE(review): the Cookie value is a logged-in browser session (SESSDATA, bili_jct) and has
    // likely expired; replace it with your own value, ideally loaded from configuration.
    Request request = new Request.Builder()
            .url(url)
            .get()
            .addHeader("Accept", "application/json, text/plain, */*")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.9")
            .addHeader("Connection", "keep-alive")
            .addHeader("Cookie", "_uuid=C3AE0F67-145C-8785-D1A4-4F282135CBB530050infoc; buvid3=C555ED83-C23D-4B0D-903C-0462D7FD72D3155814infoc; LIVE_BUVID=AUTO3815773674375992; sid=b7j5kb6d; CURRENT_FNVAL=16; stardustvideo=1; rpdid=|(kRk|RmR)m0J'ul~~Ymu~Ru; CURRENT_QUALITY=112; DedeUserID=38123367; DedeUserID__ckMd5=b0c827fbaec70b70; SESSDATA=ee5c9fe7%2C1600926436%2C31154*31; bili_jct=1c0937381c7c1ccf6685b35407d517e1; PVID=1; bp_t_offset_38123367=374255508782403914")
            .addHeader("Host", "api.bilibili.com")
            .addHeader("Origin", "https://www.bilibili.com")
            .addHeader("Referer", "https://www.bilibili.com/anime/index/")
            .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36")
            .addHeader("Pragma", "no-cache")
            .build();
    try {
        // execute() performs the call synchronously and throws IOException on failure.
        return HTTP_CLIENT.newCall(request).execute();
    } catch (IOException e) {
        e.printStackTrace();
        // Callers must treat null as "no response".
        return null;
    }
  }

每次请求完成后将num加1,用于请求下一页。

2.封装成JSONObject并通过hasnext判断

    // Fetch page after page until the API reports that no further page exists.
    while (hasNext == 1) {
        response = getResponse();
        if (response == null) {
            // getResponse() returns null after an IOException; there is nothing to parse.
            break;
        }
        String re;
        try {
            // string() decodes using the response charset (UTF-8 when none is declared)
            // instead of the platform default, and closes the body afterwards.
            re = response.body().string();
        } catch (IOException e) {
            e.printStackTrace();
            break;
        }
        JSONObject jsonObject = new JSONObject(re);
        JSONObject data = jsonObject.getJSONObject("data");
        System.out.println(data.get("num"));
        // "num" is the page just returned; the next request asks for the following one.
        num = data.getInt("num") + 1;
        hasNext = data.getInt("has_next");

        try {
            // Pause between pages to avoid hammering the API.
            Thread.sleep(2000);
        } catch (InterruptedException e) {
            // Restore the interrupt flag and stop crawling instead of ignoring the request.
            Thread.currentThread().interrupt();
            break;
        }
    }

3.获取并遍历list

 // Each element of data.list is one anime entry; hand every entry to the DAO for storage.
 JSONArray animeList = data.getJSONArray("list");
 for (Object entry : animeList) {
     anime.saveJson((JSONObject) entry);
 }

存数据库

这里用到了commons-dbutils
大概这样:

 /**
  * Inserts one anime entry into the {@code anime} table.
  *
  * @param JSON the JSON object from which the column values are read
  * @return the value returned by {@code update} for the insert statement
  *     (presumably the affected row count; confirm against the DAO base class)
  */
 public int saveJson(JSONObject JSON) {
     // Read the values in the same order as the placeholders of the statement.
     String badge = JSON.getString("badge");
     String cover = JSON.getString("cover");
     String indexShow = JSON.getString("index_show");
     int isFinish = JSON.getInt("is_finish");
     String link = JSON.getString("link");
     String order = JSON.getString("order");
     String title = JSON.getString("title");
     return update(
             "insert anime(`badge`,`cover`,`index_show`,`is_finish`,`link`,`order`,`title`) values (?,?,?,?,?,?,?) ",
             badge, cover, indexShow, isFinish, link, order, title);
 }

主要代码:

package lin.test;

import lin.dao.impl.animeImpl;
import okhttp3.*;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.IOException;

/**
 * Crawls bilibili's anime index page by page and stores every entry through {@code animeImpl}.
 *
 * <p>Each iteration requests one page of the index API, reads the {@code data} object of the JSON
 * reply, saves every element of {@code data.list}, and continues while {@code data.has_next}
 * equals 1. The loop pauses between pages and stops early when a request fails or the thread is
 * interrupted.
 *
 * <p>The paging state lives in plain instance fields without synchronization.
 *
 * @author lin945
 * @date 2020/4/5 20:28
 */
public class HttpThead implements Runnable {
    /** Pause between two page requests, in milliseconds. */
    private static final long PAGE_DELAY_MILLIS = 2000L;

    /**
     * Shared by all requests: OkHttp recommends a single client so that its connection pool and
     * worker threads are reused instead of being recreated for every page.
     */
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient();

    private animeImpl anime = new animeImpl();
    /** Value of {@code has_next} from the latest reply; 1 means another page exists. */
    private int hasNext = 1;
    /** Number of the page to request next. */
    private int num = 1;
    /** The most recently received response; its body is already consumed after each iteration. */
    Response response;

    public HttpThead() {
    }

    /**
     * Runs the crawl loop until the API reports no further page, a request fails, or the thread
     * is interrupted.
     */
    @Override
    public void run() {
        while (hasNext == 1) {
            JSONObject data = fetchPageData();
            if (data == null) {
                // The failure has already been reported; do not continue with missing data.
                break;
            }
            System.out.println(data.get("num"));
            // "num" is the page just returned; the next request asks for the following one.
            num = data.getInt("num") + 1;
            hasNext = data.getInt("has_next");
            JSONArray jsonArray = data.getJSONArray("list");
            for (Object o : jsonArray) {
                anime.saveJson((JSONObject) o);
            }
            try {
                // Pause between pages to avoid hammering the API.
                Thread.sleep(PAGE_DELAY_MILLIS);
            } catch (InterruptedException e) {
                // Restore the interrupt flag and stop crawling instead of ignoring the request.
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    /**
     * Fetches the current page and extracts the {@code data} object from the reply.
     *
     * @return the {@code data} object, or {@code null} when the request failed, the server
     *     answered with a non-success status, or the reply carries no {@code data} object
     */
    private JSONObject fetchPageData() {
        response = getResponse();
        if (response == null) {
            return null;
        }
        String body;
        // Closing the response releases the connection even when the body is never read.
        try (Response current = response) {
            if (!current.isSuccessful()) {
                System.err.println("Request for page " + num + " failed with HTTP status " + current.code());
                return null;
            }
            // string() decodes using the response charset (UTF-8 when none is declared)
            // rather than the platform default.
            body = current.body().string();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
        JSONObject data = new JSONObject(body).optJSONObject("data");
        if (data == null) {
            System.err.println("Reply for page " + num + " contains no data object: " + body);
        }
        return data;
    }

    /**
     * Requests page {@code num} of the bilibili anime index API.
     *
     * <p>The call is synchronous and blocks the calling thread until the response headers arrive.
     *
     * @return the HTTP response, whose body the caller must read or close, or {@code null} when
     *     the request fails with an {@link IOException}
     */
    Response getResponse() {
        String url = "https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=" + num + "&season_type=1&pagesize=20&type=1";
        // Build a GET request for the index API that carries the same headers a browser sends.
        // NOTE(review): the Cookie value is a logged-in browser session (SESSDATA, bili_jct) and
        // has likely expired; replace it with your own value, ideally loaded from configuration.
        Request request = new Request.Builder()
                .url(url)
                .get()
                .addHeader("Accept", "application/json, text/plain, */*")
                .addHeader("Accept-Language", "zh-CN,zh;q=0.9")
                .addHeader("Connection", "keep-alive")
                .addHeader("Cookie", "_uuid=C3AE0F67-145C-8785-D1A4-4F282135CBB530050infoc; buvid3=C555ED83-C23D-4B0D-903C-0462D7FD72D3155814infoc; LIVE_BUVID=AUTO3815773674375992; sid=b7j5kb6d; CURRENT_FNVAL=16; stardustvideo=1; rpdid=|(kRk|RmR)m0J'ul~~Ymu~Ru; CURRENT_QUALITY=112; DedeUserID=38123367; DedeUserID__ckMd5=b0c827fbaec70b70; SESSDATA=ee5c9fe7%2C1600926436%2C31154*31; bili_jct=1c0937381c7c1ccf6685b35407d517e1; PVID=1; bp_t_offset_38123367=374255508782403914")
                .addHeader("Host", "api.bilibili.com")
                .addHeader("Origin", "https://www.bilibili.com")
                .addHeader("Referer", "https://www.bilibili.com/anime/index/")
                .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36")
                .addHeader("Pragma", "no-cache")
                .build();
        try {
            // execute() performs the call synchronously and throws IOException on failure.
            return HTTP_CLIENT.newCall(request).execute();
        } catch (IOException e) {
            e.printStackTrace();
            // Callers must treat null as "no response".
            return null;
        }
    }
}

最后数据库的数据大概这样:

爬取bilibili的番剧

版权协议须知!

本篇文章来源于 林唯心 ,如本文章侵犯到任何版权问题,请立即告知本站,本站将及时予以删除并致以最深的歉意

329 0 2020-04-05


分享:
JAVAEE 入门de笔记

现在学习JAVAEE的知识希望能记录下来,学的越多越发显得...

阅读全文
简单的一言爬虫

前言介绍一个比较好玩的api 一言hitokoto看着这么...

阅读全文
icon_mrgreen.gificon_neutral.gificon_twisted.gificon_arrow.gificon_eek.gificon_smile.gificon_confused.gificon_cool.gificon_evil.gificon_biggrin.gificon_idea.gificon_redface.gificon_razz.gificon_rolleyes.gificon_wink.gificon_cry.gificon_surprised.gificon_lol.gificon_mad.gificon_sad.gificon_exclaim.gificon_question.gif
萌ICP备20201111号
  • 默认
  • 护眼
  • 夜晚
  • 壁纸
  • 默认