本文介绍使用Jupyter Lab进行全球疫情数据分析的操作方法。

说明:疫情数据源暂时无法直接模拟,本示例仅供您在数据科学分析场景中参考使用。

步骤一:初始化日志服务Client

LogClient是日志服务的Python客户端,用于管理Project、Logstore等日志服务资源。使用Python SDK发起日志服务请求,您需要初始化一个Client实例。示例代码如下所示:

# Setup basic client
# !pip install -U matplotlib
import time
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import aliyun.log as sls

# Log Service endpoint. For more information, see the "Endpoints" documentation.
endpoint = "cn-beijing.log.aliyuncs.com"

# Alibaba Cloud AccessKey ID and AccessKey Secret. For more information, see
# the "AccessKey pair" documentation. Replace the placeholders with your own
# values; avoid committing real keys together with the notebook.
accessId = "YOUR_ACCESS_ID"
accessKey= "YOUR_ACCESS_KEY"
# Project name.
project  = "YOUR_SLS_PROJECT"
# MetricStore name.
# NOTE(review): not referenced by any of the snippets visible in this document.
metricstore = "YOUR_SLS_METRICSTORE"
# Logstore that stores inspection results.
# NOTE(review): not referenced by any of the snippets visible in this document.
sink_logstore = 'YOUR_SLS_LOGSTORE_FOR_RESULTS_WRITE' 
# Task name.
# NOTE(review): not referenced by any of the snippets visible in this document.
task_name = "YOUR_TASK_NAME" 
# Create the LogClient that the later query snippets use.
client = sls.LogClient(endpoint, accessId, accessKey)

步骤二:获取疫情数据

示例代码如下(定义查询函数query_logstore,后续步骤均通过该函数从Logstore中查询数据,并将结果转换为pandas DataFrame):
import time
import json
import pandas as pd

import matplotlib.pyplot as plt

# Project that holds the epidemic data (same placeholder value as in step 1).
project  = "YOUR_SLS_PROJECT"
# Logstore that query_logstore() reads from.
logstore = "YOUR_SLS_LOGSTORE"


def query_logstore(query, stime, etime):
    """Run ``query`` on the configured Logstore and return the result rows.

    Uses the module-level ``client``, ``project`` and ``logstore``.

    Args:
        query: Query/analysis statement passed to ``client.get_log_all``.
        stime: Start of the time range (the snippets pass Unix seconds).
        etime: End of the time range (the snippets pass Unix seconds).

    Returns:
        A pandas DataFrame built from the ``get_contents()`` value of every
        log in every response yielded by ``client.get_log_all``.
    """
    responses = client.get_log_all(project, logstore, stime, etime, query=query)

    # Flatten all responses into one list of per-log content mappings.
    rows = [entry.get_contents() for resp in responses for entry in resp.logs]

    return pd.DataFrame(rows)

步骤三:查询和分析疫情数据

  • 全球疫情热力图
    示例代码如下:
    # Install the dependency
    # !pip3 install folium --user
    
    # Query window: the last 7 days, as Unix timestamps in seconds.
    etime = int(time.time())
    stime = etime - 86400 * 7
    
    query = '''
    Type : "Country/Region Cases" or Type : "Province/State Cases" | select LatLng[1] as lat, LatLng[2] as lng, Confirmed from (select  split(LatLng, ',') as LatLng, Confirmed from log l right join (select max(version) as version from log) r on  l.version =  r.version where LatLng is not null and LatLng like '%,%' limit 1000)'''
    
    df_ncp_global = query_logstore(query, stime, etime)
    
    # Cast the three selected columns to float before building the heat-map rows.
    df_ncp_global['lat'] = df_ncp_global['lat'].astype(float)
    df_ncp_global['lng'] = df_ncp_global['lng'].astype(float)
    df_ncp_global['Confirmed'] = df_ncp_global['Confirmed'].astype(float)
    # Select the columns explicitly so every row is [lat, lng, Confirmed]
    # regardless of the column order of the returned DataFrame.
    data = df_ncp_global[['lat', 'lng', 'Confirmed']].values.tolist()
    
    # folium itself must be imported: folium.Map is used below, and importing
    # only folium.plugins.HeatMap does not bind the name `folium`.
    import folium
    from folium.plugins import HeatMap
    
    city_map = folium.Map(location=(35.4,-100.56), zoom_start=4)
    
    city_map.add_child(HeatMap(data=data))
    # Last expression of the cell, so Jupyter renders the map.
    city_map
  • 全球累计确诊数按国家和地区排行
    示例代码如下:
    # Query window: the last 7 days, as Unix timestamps in seconds.
    # etime is computed here (as in the other snippets) so this cell does not
    # depend on a value left behind by a previously executed cell.
    etime = int(time.time())
    stime = etime - 86400 * 7
    
    query = '''
    Type : "Country/Region Cases" | select min(num) as num, case when num <= 10 then "Country/Region" else 'Other' end as "Country/Region", sum(Confirmed) as "Total Confirmed" from ( select row_number() over () AS num, "Country/Region", Confirmed from (select "Country/Region", Confirmed  from log l right join (select max(version) as version from log) r on  l.version = r.version order by Confirmed desc)) group by "Country/Region" order by  num
    '''
    
    df_ncp_country = query_logstore(query, stime, etime)
    # Cast the totals to int so the bar lengths are numeric.
    df_ncp_country['Total Confirmed'] = df_ncp_country['Total Confirmed'].astype(int)
    # Horizontal bar chart: one bar per "Country/Region" value returned by the query.
    plt.barh(df_ncp_country['Country/Region'],df_ncp_country['Total Confirmed'])
    
    # Last expression of the cell, so Jupyter also shows the table.
    df_ncp_country
  • 全球疫情态势
    示例代码如下(先获取最新的数据版本lastest_version,后续的趋势分析示例依赖该变量):
    # Get the latest data version within the last 7 days.
    # The result is kept in `lastest_version` (spelling kept as-is because the
    # trend snippets below interpolate this variable into their queries).
    etime = int(time.time())
    stime = etime - 86400 * 7
    
    query = '''
    * and type: "Global Cases" | select max(version) as lastest_version
    '''
    
    df_ncp_data_version = query_logstore(query, stime, etime)
    # Take the value in the first row / first column of the result.
    # Raises IndexError if the query returned no rows.
    lastest_version = df_ncp_data_version.values[0][0]
    
    print(lastest_version)
  • 全球累计确诊数趋势
    示例代码如下:
    etime = int(time.time())
    stime = etime - 86400 * 7
    
    query = f'''*  and version: "{lastest_version}" and type: "Global Cases" '''
    
    df_ncp_global = query_logstore(query, stime, etime)
    
    
    datas = []
    def g(x):
        ret = json.loads(x)
        for k,v in ret.items():
            datas.append({'ds' : k, 'month' : ''.join(k.split('-')[0:2]), 'value' : v})
    df_ncp_global['Confirmed Trend'].apply(g)
    
    df_global_trend = pd.DataFrame(datas)
    df_global_month_trend = df_global_trend.groupby(['month']).agg({'value': max}).reset_index()
    
    %matplotlib inline
    import matplotlib.pyplot as plt
    
    figure = plt.figure(figsize=(18,5),dpi=98)
    
    plt.title(u"NCP Global Cases - Confirmed Trend", fontproperties='SimHei',fontsize = 15)
    
    plt.plot(df_global_month_trend['month'],
             df_global_month_trend['value'],
             label='value', color = 'r')
    
    plt.legend(loc='best')
  • 国家累计确诊数趋势
    示例代码如下:
    etime = int(time.time())
    stime = etime - 86400 * 7
    country_region = 'China'
    
    query = f'''*  and version: "{lastest_version}" and type: "Country/Region Cases" '''
    
    df_ncp_country = query_logstore(query, stime, etime)
    
    datas = []
    def g(x):
        ret = json.loads(x['Confirmed Trend'])
        for k,v in ret.items():
            datas.append({'Country/Region' : x['Country/Region'] ,'ds' : k, 'month' : ''.join(k.split('-')[0:2]), 'value' : v})
    df_ncp_country[['Confirmed Trend','Country/Region']].apply(g,axis=1)
    
    df_country_moth_trend = pd.DataFrame(datas)
    df_country_moth_trend = df_country_moth_trend.groupby(['Country/Region','month']).agg({'value' : 'max'}).reset_index()
    df_country_trend_spec = df_country_moth_trend[df_country_moth_trend['Country/Region']==country_region]
    
    %matplotlib inline
    import matplotlib.pyplot as plt
    
    figure = plt.figure(figsize=(18,5),dpi=98)
    
    plt.title(f"NCP {country_region} Cases - Confirmed Trend", fontproperties='SimHei',fontsize = 15)
    
    plt.plot(df_country_trend_spec['month'],
             df_country_trend_spec['value'],
             label='value', color = 'r')
    
    plt.legend(loc='best')
  • 确诊患者行程信息
    示例代码如下:
    # Query window: the last 7 days, as Unix timestamps in seconds.
    etime = int(time.time())
    stime = etime - 86400 * 7
    
    # Select travel_detail records joined to the maximum `version`, rename the
    # columns to display names, strip a leading 'https://' or 'http://' from
    # `url`, and return at most 50000 rows ordered by start_time descending.
    query = '''
    type: travel_detail | select start_time as "开始时间", end_time as "结束时间", travel_type as "出行类型", travel_no as "车次/车牌/航班/场所名", travel_sub_no as "车厢", travel_detail as "出行描述", start_pos as "出发站", end_pos as "到达站", case when strpos(url, 'https://') = 1 then substr(url, 9) when strpos(url, 'http://') = 1 then substr(url, 8) else url end as url, author as "线索来源", '详情' as "新闻" from log l right join (select max(version) as version from log) r on l.version = r.version order by start_time desc limit 50000
    '''
    
    df_ncp_trace = query_logstore(query, stime, etime)
    # Last expression of the cell, so Jupyter shows the table.
    df_ncp_trace