Skip to content

Commit

Permalink
Fixed notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
Raphaël Barman committed Mar 15, 2020
1 parent a161d77 commit 65e65df
Showing 1 changed file with 21 additions and 207 deletions.
228 changes: 21 additions & 207 deletions 02-Python et Mediawiki.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -74,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -356,7 +356,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -384,31 +384,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8693fb90794b4a5ab5e0c574fb5d42b0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# all_pages = []\n",
"# for r in tqdm(site.query(list='allpages')):\n",
Expand All @@ -419,29 +397,9 @@
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'pageid': 228702, 'ns': 0, 'title': 'Cecil Georges-Bazile'},\n",
" {'pageid': 281912, 'ns': 0, 'title': 'Cecil Grayson'},\n",
" {'pageid': 239845, 'ns': 0, 'title': 'Cecil H. Brown'},\n",
" {'pageid': 241900, 'ns': 0, 'title': 'Cecil H. Uyehara'},\n",
" {'pageid': 539771, 'ns': 0, 'title': 'Cecil Herbert Stuart Fifoot'},\n",
" {'pageid': 379430, 'ns': 0, 'title': 'Cecil Hill'},\n",
" {'pageid': 179638, 'ns': 0, 'title': 'Cecil James Sharp'},\n",
" {'pageid': 254418, 'ns': 0, 'title': 'Cecil Jane'},\n",
" {'pageid': 462835, 'ns': 0, 'title': 'Cecil Jermyn Brown'},\n",
" {'pageid': 386899, 'ns': 0, 'title': 'Cecil John Layton Price'}]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_pages = load_gzip_json('./all_pages.json.gz')\n",
"all_pages[100010:100020]"
Expand All @@ -458,17 +416,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"We now have 676372 out of 676372.\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# def get_all_page_infos(page):\n",
"# return site('parse', page=page, prop=['wikitext','links',\n",
Expand Down Expand Up @@ -520,45 +470,9 @@
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'title': 'Cecil Hill',\n",
" 'pageid': 379430,\n",
" 'revid': 1033163,\n",
" 'categories': [],\n",
" 'links': [],\n",
" 'images': [],\n",
" 'externallinks': ['https://www.wikidata.org/wiki/Q49018849',\n",
" 'https://www.wikidata.org/wiki/Q5',\n",
" 'http://data.bnf.fr/ark:/12148/cb129571151'],\n",
" 'sections': [],\n",
" 'wikitext': \"Wikidata: [https://www.wikidata.org/wiki/Q49018849 Q49018849] ([https://www.wikidata.org/wiki/Q5 Q5]) ''Uncertain identification''\\n\\nBnF ID: [http://data.bnf.fr/ark:/12148/cb129571151 129571151]\"},\n",
" {'title': 'Cecil James Sharp',\n",
" 'pageid': 179638,\n",
" 'revid': 857175,\n",
" 'categories': [],\n",
" 'links': [{'ns': 0, 'title': 'Décès', 'exists': True},\n",
" {'ns': 0, 'title': 'Londres', 'exists': True},\n",
" {'ns': 0, 'title': 'Naissance', 'exists': True},\n",
" {'ns': 0, 'title': '1859.11.22', 'exists': False},\n",
" {'ns': 0, 'title': '1924.06.28', 'exists': False}],\n",
" 'images': [],\n",
" 'externallinks': ['https://www.wikidata.org/wiki/Q3889019',\n",
" 'https://www.wikidata.org/wiki/Q5',\n",
" 'http://data.bnf.fr/ark:/12148/cb14820854n'],\n",
" 'sections': [],\n",
" 'wikitext': 'Wikidata: [https://www.wikidata.org/wiki/Q3889019 Q3889019] ([https://www.wikidata.org/wiki/Q5 Q5])\\n\\nBnF ID: [http://data.bnf.fr/ark:/12148/cb14820854n 14820854n]\\n\\n*[[1859.11.22]] / [[Londres]]. [[Naissance]] de [[Cecil James Sharp]]. [http://data.bnf.fr/ark:/12148/cb14820854n]\\n\\n*[[1924.06.28]] / [[Londres]]. [[Décès]] de [[Cecil James Sharp]]. [http://data.bnf.fr/ark:/12148/cb14820854n]'}]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_data = load_gzip_json('./all_data.json.gz')\n",
"all_data[100000:100002]"
Expand All @@ -582,84 +496,11 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[{'ns': 0, 'title': 'Albert Cohen', 'exists': True},\n",
" {'ns': 0, 'title': 'Alberto Giacometti', 'exists': True},\n",
" {'ns': 0, 'title': 'Alfred Cortot', 'exists': True},\n",
" {'ns': 0, 'title': 'André Breton', 'exists': True},\n",
" {'ns': 0, 'title': 'Audrey Hepburn', 'exists': True},\n",
" {'ns': 0, 'title': 'Ayrton Senna', 'exists': True},\n",
" {'ns': 0, 'title': 'Bio', 'exists': True},\n",
" {'ns': 0, 'title': 'Bjorn Borg', 'exists': True},\n",
" {'ns': 0, 'title': 'Charles Aznavour', 'exists': True},\n",
" {'ns': 0, 'title': 'Charlie Chaplin', 'exists': True},\n",
" {'ns': 0, 'title': 'Claude Monet', 'exists': True},\n",
" {'ns': 0, 'title': 'Constantin Regamey', 'exists': True},\n",
" {'ns': 0, 'title': 'David Bowie', 'exists': True},\n",
" {'ns': 0, 'title': 'Emile Zola', 'exists': True},\n",
" {'ns': 0, 'title': 'Enzo Ferrari', 'exists': True},\n",
" {'ns': 0, 'title': 'Ferdinand Hodler', 'exists': True},\n",
" {'ns': 0, 'title': 'François Perréard', 'exists': True},\n",
" {'ns': 0, 'title': 'Friedrich Dürrenmatt', 'exists': True},\n",
" {'ns': 0, 'title': 'Gabrielle Antille', 'exists': True},\n",
" {'ns': 0, 'title': 'Gustave Altherr', 'exists': True},\n",
" {'ns': 0, 'title': 'Hans Ruedi Giger', 'exists': True},\n",
" {'ns': 0, 'title': 'Hermann Hesse', 'exists': True},\n",
" {'ns': 0, 'title': 'Jacques Chirac', 'exists': True},\n",
" {'ns': 0, 'title': 'Jean-Luc Godard', 'exists': True},\n",
" {'ns': 0, 'title': 'Jean-Paul Sartre', 'exists': True},\n",
" {'ns': 0, 'title': 'Jean-Pierre Bregnard', 'exists': True},\n",
" {'ns': 0, 'title': 'Jean Calvin', 'exists': True},\n",
" {'ns': 0, 'title': 'Jeanne Hersch', 'exists': True},\n",
" {'ns': 0, 'title': 'John Fitzgerald Kennedy', 'exists': True},\n",
" {'ns': 0, 'title': 'John Lennon', 'exists': True},\n",
" {'ns': 0, 'title': 'Juan Manuel Fangio', 'exists': True},\n",
" {'ns': 0, 'title': 'Julien Perrot', 'exists': True},\n",
" {'ns': 0, 'title': 'Kurt Cobain', 'exists': True},\n",
" {'ns': 0, 'title': 'Louis De Funès', 'exists': True},\n",
" {'ns': 0, 'title': 'Louis Lumière', 'exists': True},\n",
" {'ns': 0, 'title': 'Louise Michel', 'exists': True},\n",
" {'ns': 0, 'title': 'Mahatma Gandhi', 'exists': True},\n",
" {'ns': 0, 'title': 'Marcel Jufer', 'exists': True},\n",
" {'ns': 0, 'title': 'Marguerite Duras', 'exists': True},\n",
" {'ns': 0, 'title': 'Marguerite Yourcenar', 'exists': True},\n",
" {'ns': 0, 'title': 'Mario Botta', 'exists': True},\n",
" {'ns': 0, 'title': 'Markus Kamber', 'exists': True},\n",
" {'ns': 0, 'title': 'Maurice Cosandey', 'exists': True},\n",
" {'ns': 0, 'title': 'Max Planck', 'exists': True},\n",
" {'ns': 0, 'title': 'Michael Jackson', 'exists': True},\n",
" {'ns': 0, 'title': 'Nicolas II', 'exists': True},\n",
" {'ns': 0, 'title': 'Otto von Bismarck', 'exists': True},\n",
" {'ns': 0, 'title': 'Pablo Picasso', 'exists': True},\n",
" {'ns': 0, 'title': 'Patrice Borcard', 'exists': True},\n",
" {'ns': 0, 'title': 'Patrice Haesslein', 'exists': True},\n",
" {'ns': 0, 'title': 'Philippe Jaccottet', 'exists': True},\n",
" {'ns': 0, 'title': 'Pierre de Coubertin', 'exists': True},\n",
" {'ns': 0, 'title': 'Rod Laver', 'exists': True},\n",
" {'ns': 0, 'title': 'Roger Monney', 'exists': True},\n",
" {'ns': 0, 'title': 'Salvador Dalí', 'exists': True},\n",
" {'ns': 0, 'title': 'Sam Humbert', 'exists': True},\n",
" {'ns': 0, 'title': 'Serge Gainsbourg', 'exists': True},\n",
" {'ns': 0, 'title': 'Simone de Beauvoir', 'exists': True},\n",
" {'ns': 0, 'title': 'Stanley Kubrick', 'exists': True},\n",
" {'ns': 0, 'title': 'Victor Hugo', 'exists': True},\n",
" {'ns': 0, 'title': 'William Grenier', 'exists': True},\n",
" {'ns': 0, 'title': 'Wolfgang Amadeus Mozart', 'exists': True},\n",
" {'ns': 0, 'title': 'Yuri Gagarin', 'exists': True}]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"site('parse', page='Biographies', prop='links')['parse']['links']"
]
Expand All @@ -673,7 +514,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -686,36 +527,9 @@
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There is 497586 pages about humans.\n"
]
},
{
"data": {
"text/plain": [
"['Alan Henry',\n",
" 'Alan Henry Linton',\n",
" 'Alan Herdman',\n",
" 'Alan Heuser',\n",
" 'Alan Hewitt',\n",
" 'Alan Hills',\n",
" 'Alan Hindley',\n",
" 'Alan Hirshfeld',\n",
" 'Alan Hodson',\n",
" 'Alan Holden']"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"There is {len(humans)} pages about humans.\")\n",
"humans[10000:10010]"
Expand Down

0 comments on commit 65e65df

Please sign in to comment.