# 数据采集组件 ——Perceval
# 官方文档
https://perceval.readthedocs.io/en/latest/
# 介绍
Perceval is a Python module for retrieving data from repositories related to software development. It works with many data sources, from git repositories and GitHub projects to mailing lists, Gerrit or StackOverflow. In this chapter, you will learn the basics of working with Perceval, including how to use it to retrieve information from some kinds of repositories. You’re on your way to software development analysis!
perceval 就是 grimoirelab 的数据源采集组件,也就是最基础的组件。
# 安装
pip3 install perceval
或者也可以用 docker 方式:
docker run -it grimoirelab/perceval
# 食用方式
这个工具说白了就是采集工具,它支持的后端有:
askbot Fetch questions and answers from Askbot site
bugzilla Fetch bugs from a Bugzilla server
bugzillarest Fetch bugs from a Bugzilla server (>=5.0) using its REST API
confluence Fetch contents from a Confluence server
discourse Fetch posts from Discourse site
dockerhub Fetch repository data from Docker Hub site
gerrit Fetch reviews from a Gerrit server
git Fetch commits from Git
github Fetch issues, pull requests and repository information from GitHub
gitlab Fetch issues, merge requests from GitLab
gitter Fetch messages from a Gitter room
googlehits Fetch hits from Google API
groupsio Fetch messages from Groups.io
hyperkitty Fetch messages from a HyperKitty archiver
jenkins Fetch builds from a Jenkins server
jira Fetch issues from JIRA issue tracker
launchpad Fetch issues from Launchpad issue tracker
mattermost Fetch posts from a Mattermost server
mbox Fetch messages from MBox files
mediawiki Fetch pages and revisions from a MediaWiki site
meetup Fetch events from a Meetup group
nntp Fetch articles from a NNTP news group
pagure Fetch issues from Pagure
phabricator Fetch tasks from a Phabricator site
pipermail Fetch messages from a Pipermail archiver
redmine Fetch issues from a Redmine server
rocketchat Fetch messages from a Rocket.Chat channel
rss Fetch entries from a RSS feed server
slack Fetch messages from a Slack channel
stackexchange Fetch questions from StackExchange sites
supybot Fetch messages from Supybot log files
telegram Fetch messages from the Telegram server
twitter Fetch tweets from the Twitter Search API
有点意外,我没有想到连 Twitter 都能支持。。。。
# git
time perceval git https://github.com/grimoirelab/perceval.git \
--git-path /tmp/perceval.git > /tmp/perceval.test
这条命令会将拉取到的 git commit 数据保存到对应的文件,格式为 json。
同时还支持 python 脚本的方式:
#! /usr/bin/env python3
"""Print the hash of every commit fetched from the Perceval git repository."""
from perceval.backends.core.git import Git

# url for the git repo to analyze
repo_url = 'http://github.com/grimoirelab/perceval.git'
# directory for letting Perceval clone the git repo
repo_dir = '/tmp/perceval.git'

# create a Git object, pointing to repo_url, using repo_dir for cloning
repo = Git(uri=repo_url, gitpath=repo_dir)
# fetch all commits as an iterator, and iterate it printing each hash
for commit in repo.fetch():
    print(commit['data']['commit'])
# github
perceval github grimoirelab perceval --sleep-for-rate \
-t "$GITHUB_TOKEN" > ~/tmp/github.json
(GITHUB_TOKEN 为你自己的 GitHub 个人访问令牌,请通过环境变量传入,切勿把真实令牌写进文档或提交到仓库。)
通过 perceval <backend> <owner> <repo> 这种方式来获取数据(上面的命令中 backend 为 github,owner 为 grimoirelab,repo 为 perceval)。
通过 --category issue 来指定获取 issue 信息,值得注意的是 github 会把 pr 当做 issue 进行处理,所以也会自动得到所有 pr 信息。
{
"backend_name": "GitHub",
"backend_version": "0.27.0",
"category": "issue",
"classified_fields_filtered": null,
"data": {
"active_lock_reason": null,
"assignee": null,
"assignee_data": {},
"assignees": [],
"assignees_data": [],
"author_association": "CONTRIBUTOR",
"body": "Based on Sphynx, prepared for ReadTheDocs.\n\nRight now, this produces (from jgbarah/perceval repository) [this documentation in ReadTheDocs](http://perceval.readthedocs.org). Once this PR is accepted, I plan to switch ReadTheDocs to point to this repostory (master branch), so that the documentation gets rebuilt every time changes are made to the source code.\n\nThe configuration (docs/conf.py) include lines for running sphinx-apidoc, which generates automatically the docs/perceval.rst file, which is the entry point for the automatically generated documentation, produced based on the docstring comments in the source code.\n\nThe file index.rst is still a bare bones schema. It should be completed in a later patch, with more detailed information about Perceval itself.\n",
"closed_at": "2016-01-04T13:51:56Z",
"comments": 0,
"comments_data": [],
"comments_url": "https://api.github.com/repos/chaoss/grimoirelab-perceval/issues/3/comments",
"created_at": "2016-01-03T23:46:04Z",
"draft": false,
"events_url": "https://api.github.com/repos/chaoss/grimoirelab-perceval/issues/3/events",
"html_url": "https://github.com/chaoss/grimoirelab-perceval/pull/3",
"id": 124679251,
"labels": [],
"labels_url": "https://api.github.com/repos/chaoss/grimoirelab-perceval/issues/3/labels{/name}",
"locked": false,
"milestone": null,
"node_id": "MDExOlB1bGxSZXF1ZXN0NTQ5MzUxODA=",
"number": 3,
"performed_via_github_app": null,
"pull_request": {
"diff_url": "https://github.com/chaoss/grimoirelab-perceval/pull/3.diff",
"html_url": "https://github.com/chaoss/grimoirelab-perceval/pull/3",
"merged_at": null,
"patch_url": "https://github.com/chaoss/grimoirelab-perceval/pull/3.patch",
"url": "https://api.github.com/repos/chaoss/grimoirelab-perceval/pulls/3"
},
"reactions": {
"+1": 0,
"-1": 0,
"confused": 0,
"eyes": 0,
"heart": 0,
"hooray": 0,
"laugh": 0,
"rocket": 0,
"total_count": 0,
"url": "https://api.github.com/repos/chaoss/grimoirelab-perceval/issues/3/reactions"
},
"reactions_data": [],
"repository_url": "https://api.github.com/repos/chaoss/grimoirelab-perceval",
"state": "closed",
"timeline_url": "https://api.github.com/repos/chaoss/grimoirelab-perceval/issues/3/timeline",
"title": "Config files for a documentation, using Sphinx.",
"updated_at": "2016-01-04T17:42:23Z",
"url": "https://api.github.com/repos/chaoss/grimoirelab-perceval/issues/3",
"user": {
"avatar_url": "https://avatars.githubusercontent.com/u/1039693?v=4",
"events_url": "https://api.github.com/users/jgbarah/events{/privacy}",
"followers_url": "https://api.github.com/users/jgbarah/followers",
"following_url": "https://api.github.com/users/jgbarah/following{/other_user}",
"gists_url": "https://api.github.com/users/jgbarah/gists{/gist_id}",
"gravatar_id": "",
"html_url": "https://github.com/jgbarah",
"id": 1039693,
"login": "jgbarah",
"node_id": "MDQ6VXNlcjEwMzk2OTM=",
"organizations_url": "https://api.github.com/users/jgbarah/orgs",
"received_events_url": "https://api.github.com/users/jgbarah/received_events",
"repos_url": "https://api.github.com/users/jgbarah/repos",
"site_admin": false,
"starred_url": "https://api.github.com/users/jgbarah/starred{/owner}{/repo}",
"subscriptions_url": "https://api.github.com/users/jgbarah/subscriptions",
"type": "User",
"url": "https://api.github.com/users/jgbarah"
},
"user_data": {
"avatar_url": "https://avatars.githubusercontent.com/u/1039693?v=4",
"bio": null,
"blog": "http://gsyc.es/~jgb",
"company": null,
"created_at": "2011-09-09T21:47:40Z",
"email": null,
"events_url": "https://api.github.com/users/jgbarah/events{/privacy}",
"followers": 100,
"followers_url": "https://api.github.com/users/jgbarah/followers",
"following": 0,
"following_url": "https://api.github.com/users/jgbarah/following{/other_user}",
"gists_url": "https://api.github.com/users/jgbarah/gists{/gist_id}",
"gravatar_id": "",
"hireable": null,
"html_url": "https://github.com/jgbarah",
"id": 1039693,
"location": null,
"login": "jgbarah",
"name": "Jesus M. Gonzalez-Barahona",
"node_id": "MDQ6VXNlcjEwMzk2OTM=",
"organizations": [
{
"avatar_url": "https://avatars.githubusercontent.com/u/1843608?v=4",
"description": null,
"events_url": "https://api.github.com/orgs/MetricsGrimoire/events",
"hooks_url": "https://api.github.com/orgs/MetricsGrimoire/hooks",
"id": 1843608,
"issues_url": "https://api.github.com/orgs/MetricsGrimoire/issues",
"login": "MetricsGrimoire",
"members_url": "https://api.github.com/orgs/MetricsGrimoire/members{/member}",
"node_id": "MDEyOk9yZ2FuaXphdGlvbjE4NDM2MDg=",
"public_members_url": "https://api.github.com/orgs/MetricsGrimoire/public_members{/member}",
"repos_url": "https://api.github.com/orgs/MetricsGrimoire/repos",
"url": "https://api.github.com/orgs/MetricsGrimoire"
},
{
"avatar_url": "https://avatars.githubusercontent.com/u/1918070?v=4",
"description": null,
"events_url": "https://api.github.com/orgs/Bitergia/events",
"hooks_url": "https://api.github.com/orgs/Bitergia/hooks",
"id": 1918070,
"issues_url": "https://api.github.com/orgs/Bitergia/issues",
"login": "Bitergia",
"members_url": "https://api.github.com/orgs/Bitergia/members{/member}",
"node_id": "MDEyOk9yZ2FuaXphdGlvbjE5MTgwNzA=",
"public_members_url": "https://api.github.com/orgs/Bitergia/public_members{/member}",
"repos_url": "https://api.github.com/orgs/Bitergia/repos",
"url": "https://api.github.com/orgs/Bitergia"
},
{
"avatar_url": "https://avatars.githubusercontent.com/u/2191340?v=4",
"description": null,
"events_url": "https://api.github.com/orgs/VizGrimoire/events",
"hooks_url": "https://api.github.com/orgs/VizGrimoire/hooks",
"id": 2191340,
"issues_url": "https://api.github.com/orgs/VizGrimoire/issues",
"login": "VizGrimoire",
"members_url": "https://api.github.com/orgs/VizGrimoire/members{/member}",
"node_id": "MDEyOk9yZ2FuaXphdGlvbjIxOTEzNDA=",
"public_members_url": "https://api.github.com/orgs/VizGrimoire/public_members{/member}",
"repos_url": "https://api.github.com/orgs/VizGrimoire/repos",
"url": "https://api.github.com/orgs/VizGrimoire"
},
{
"avatar_url": "https://avatars.githubusercontent.com/u/3017044?v=4",
"description": null,
"events_url": "https://api.github.com/orgs/AlertProject/events",
"hooks_url": "https://api.github.com/orgs/AlertProject/hooks",
"id": 3017044,
"issues_url": "https://api.github.com/orgs/AlertProject/issues",
"login": "AlertProject",
"members_url": "https://api.github.com/orgs/AlertProject/members{/member}",
"node_id": "MDEyOk9yZ2FuaXphdGlvbjMwMTcwNDQ=",
"public_members_url": "https://api.github.com/orgs/AlertProject/public_members{/member}",
"repos_url": "https://api.github.com/orgs/AlertProject/repos",
"url": "https://api.github.com/orgs/AlertProject"
},
{
"avatar_url": "https://avatars.githubusercontent.com/u/16151805?v=4",
"description": "",
"events_url": "https://api.github.com/orgs/grimoirelab/events",
"hooks_url": "https://api.github.com/orgs/grimoirelab/hooks",
"id": 16151805,
"issues_url": "https://api.github.com/orgs/grimoirelab/issues",
"login": "grimoirelab",
"members_url": "https://api.github.com/orgs/grimoirelab/members{/member}",
"node_id": "MDEyOk9yZ2FuaXphdGlvbjE2MTUxODA1",
"public_members_url": "https://api.github.com/orgs/grimoirelab/public_members{/member}",
"repos_url": "https://api.github.com/orgs/grimoirelab/repos",
"url": "https://api.github.com/orgs/grimoirelab"
}
],
"organizations_url": "https://api.github.com/users/jgbarah/orgs",
"public_gists": 0,
"public_repos": 41,
"received_events_url": "https://api.github.com/users/jgbarah/received_events",
"repos_url": "https://api.github.com/users/jgbarah/repos",
"site_admin": false,
"starred_url": "https://api.github.com/users/jgbarah/starred{/owner}{/repo}",
"subscriptions_url": "https://api.github.com/users/jgbarah/subscriptions",
"twitter_username": null,
"type": "User",
"updated_at": "2022-02-16T16:51:43Z",
"url": "https://api.github.com/users/jgbarah"
}
},
"origin": "https://github.com/grimoirelab/perceval",
"perceval_version": "0.17.16",
"search_fields": {
"item_id": "124679251",
"owner": "grimoirelab",
"repo": "perceval"
},
"tag": "https://github.com/grimoirelab/perceval",
"timestamp": 1645709133.739694,
"updated_on": 1451929343.0,
"uuid": "c403532b196ed4020cc86d001feb091c009d3d26"
}
# 获取器的架构
Client: interacts directly with the data source.
Backend: orchestrates the fetching process by using the Client.
CommandLine: defines the arguments to initialize and run the Backend from the command line.