1.1. Pandas分析步骤
- 载入数据
- 按请求的 URL 分组并统计请求次数(COUNT),类似如下 SQL:
1
2
3
4
5
6
|
-- Count requests per URL and keep the 100 most requested URLs.
SELECT request_url,
       count(*)
FROM log
GROUP BY request_url
ORDER BY count(*) DESC  -- most requested first; ascending would return the least requested
LIMIT 0, 100;
|
1.2. 代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
cat pd_ng_log_stat.py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from ng_line_parser import NgLineParser
import pandas as pd
import socket
import struct
class PDNgLogStat(object):
    """Load access-log files into a pandas DataFrame and compute statistics.

    Each log line is parsed by an ``NgLineParser`` and the resulting
    records are collected into ``self.df`` by :meth:`load_data`.
    """

    def __init__(self):
        # A single parser instance is reused for every log line.
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Parse every line of every file and yield one record per line.

        Args:
            pathes: Iterable of log file paths.

        Yields:
            The value returned by ``NgLineParser.to_dict()`` for the line
            that was just parsed (presumably a dict of log fields --
            confirm against ``ng_line_parser``).
        """
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` from the given log file path(s).

        Args:
            path: Iterable of log file paths. A single path string is
                also accepted and treated as a one-element list.
        """
        # Without this guard a lone string would be iterated character by
        # character and each character opened as a file.
        if isinstance(path, str):
            path = [path]
        self.df = pd.DataFrame(self._log_line_iter(path))

    def url_req_stat(self):
        """Count requests per URL, most requested first.

        Returns:
            A DataFrame indexed by ``request_url`` with a single ``count``
            column, sorted by ``count`` in descending order.
        """
        # size() yields the number of rows in each request_url group.
        url_hits = self.df.groupby('request_url').size().to_frame('count')
        return url_hits.sort_values(by='count', ascending=False)
def main():
    """Load the sample access log and print the per-URL request counts."""
    file_pathes = ['www.ttmark.com.access.log']

    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)

    # Print the request count of each URL.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(pd_ng_log_stat.url_req_stat())


if __name__ == '__main__':
    main()
|
运行统计和输出结果
1
2
3
4
5
6
7
8
9
10
11
12
13
|
python pd_ng_log_stat.py
count
request_url
/wp-admin/admin-ajax.php 246361
/tag/ 126012
/ 57325
......
/chufang/2016/06/25/8634.html 2312
/chufang/2015/03/26/4686.html 2293
/jiaju/2014/12/05/1348.html 2230
[29205 rows x 1 columns]
|
文章转载来自:ttlsa.com