From 2494341b9f0814477c7c4543c4a6d2207ed1e97e Mon Sep 17 00:00:00 2001 From: machaerus Date: Sat, 13 Jan 2018 14:52:36 +0100 Subject: [PATCH 1/3] updated issues: applying settings and displaying citations --- __pycache__/scholar.cpython-36.pyc | Bin 0 -> 35395 bytes scholar.py | 8 ++++---- 2 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 __pycache__/scholar.cpython-36.pyc diff --git a/__pycache__/scholar.cpython-36.pyc b/__pycache__/scholar.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..118340c84ff2876b769e05c16fe8c5b49f8bbca8 GIT binary patch literal 35395 zcmchA3v?V=T3%Pb)oRIYS(asw$5W&6OJ1w}9NXiuXY8?Mk7qry*V4RtX12H0Rgzk2 zb<4L(wpEdLm-VnrmK~VDh6LCo%aXIe0djIUoK14h!U=?Ifb7C1ECh;W;Q%2Sl5BuL z9zz28zW-KLKh&O?J#dg+x2ta5y7k|C|Np=L>)tvxFp!)4#!IC?_*b#m@5S2wcHsOR z4(G*qEM~<@G0U*xb)#g+Z@d(j-$W@PzsXWkep98?+HssE>gnZ7DYKj{WtaO(eFmRS z)^p4KrT*o7DKFQl`oQvFY0!wh5Nlja#_UZ?IrWc?82_Pkx?TvMW=%m2o4aZl(lxvk zYs6oStrhTGroLr)xHK%aPU%|TG*nX88TpY3bc&2OC zDRQ@2#`c(%TZ*I3-S%i{yVY;yUyGG?G_ux!HORBeF>8}mcr9iX><8{8N;`4A*&4$2 zki8Sv58`@@HH_1i-^BGcYZTX`a{Vx_w_7`Ky+f|YaQ%R_6W2TC z`Vm||XzjxFF1g-~>xV28*QUK2_3gp+!`2wC$K-l1t{<^><9fH0--qix)?QriwfCX? zeq8Uf_TzfLT#w^=+&X~k15*9~uHR=J#PvbBexEgA718d4)+y_Vb@Vl(G+`aHjw7dN zJ!+l6?;-oJbrL-mS2OHa`_R1@d)A6i$HpJ?empm`P;<=XrnOqP%@x(WRkLiztkx@z zgWtKPGHNKd zw;w&U&|J0;Emx`w6xfod-*c;I_;io2*YAWyHr|@cxoPwH_##)4qHcJ$z0Bp8gZd8`-a@o(7%NnzB zo-dbgu2$;7olLoGHLK-v8?@|wB_I#u?8}6PW0y4)ufHj9`!6-sa>dm^pE$2c$gHS& z8AuaE<=C!Enz~&PB12&pm{mh>f@Z3q8MZ`i#f#MllWj~!nFyGzL;&878~+YR2r#d| zDS(GQKq5aM0I1<Dqjwscdg-su{kM zcxvBSsn@DCGIVFfuGZ#iwl$st?XNk0M$?(&XKz=exsE?LSF5|W3a=IUADD4^xuTX< zSNt^R4iIf8D*JuQl_k6EU^KtRQOF^QrE}@5kq6`Owzds(F)B%Jn(jp`ejbDD)~b3e z)Na((_C9&D(9Z+7@f$dVM`evoFRdGar5s%uBtagO7;t;!BnMptQ-1Ov)GVV{!_?s?VyY~JoljUb{ z^U{^8lRbNWFIxBw91h2UJ)bn>Uox3gyw@=TDP)|(;haL!IRd0fu!Z!eAqv|mD{p13 zK4B1PT;;6(*Fc{b8IH&RVAdn@oLj3q-qG{sZB<#}s8HOwRZ+F(s$;IYcEiDa&{_qu z4CZ7|thoTGXRWA5hwMUeh~6Tv!o-u3Imk~E<^5!>G1v4{mOZ;V-^pV7QThT7=Kzwm zY$y{XF&#q&->qd~HxAZfe9X|YuT03m@`RtLHC#XUe5Jl>3!7Ht^&qPRW*0xP?98`y zGyBxfp&6^nF5|g0P*}u}7!W*b3>x0H_MRxVN_MbBcDulP9VZYPz!DeZ_hL&KHAJ|0 zy&l^R;bAekgi({c)M9!uVy!!>F$z5y*WR?zM#Akiw^?j3OIHeUZJBuP>if4+lxHZIo}M+%2dqiiE0E%p!X`Ups+5|+Wm%h@m6=7i8V%W zZMf1^_l$X{cxvlx1%8(j1kWPjaXr`!9h5bx1nQPJMx zNcAQHeMkd$Rs)B=>li!%sRuBYUDv>zBQWd~>X|)2i3;`xg+oVB$QQs7HKgy zzWKxpF{^Jey`B^eCN~vu;o8`cBzf@whbq4WiY=v5Tx>K8FSTwzb(Ud-Fq zh0GPF?A~#UXX?#rrS3d|6K%XeWkkCO1kwU~ML;0M&x5jF9Y!Cpcfh!}OAq{=_{}5- zOc*vsi)gI2AOY>gi-&sw_;&6LCo{-(D^I*X9I7EyJ$v?8j=5)#BaB8~ zu1@o<83-q$i}^{uL}<%T-=fgwKuD|H@zWGI8>MVmjRVY3%el->hhP~Hr69n!nukE zS&Gv@qOr~8qzkNbjn^YQB*tk8#7e&r=at5~bwgbOjwRRQ#4}>05eu)sEj9@PA3BP2VZ0>*-D zg`^iXn?Qr`sM>_nZH4EgX3(WfC@q8A&@E~EJvD18wCXBQ(Dsn!h{oaIu@*Ts#RtZCvnE~_W zO4fpzbUg0;a%3~PVbknFHPMDCOhjoaiWQUgC99$=GnA94Uc%5vdk>7qtmF`u0$b6xvjMI+|-vo{0L@`K^X#nKz**tTL2d@d<+lktI`3e<}n zHflNQ6(%2KQfBfYCd*8S#cGv_K+;MC*EurV2n+c{HkmZCBiU3okxgflNb|XV#S&E< z(O&{*=WsY5N7A8PJxB!uT9Q~1pe14Mw6ge3TTn3Yn+aS^p`)o!yP2S0iF>KkZ|AjR z33i47oQqQl=Yu#GhZ4>=X;)IIpj}9%&0@UV2%P# zgj7+nDl4K50temOb_LkG3`&H#F;oVd&>c*-2206YvtDoBhMI>|>v)|e5Yjyd$Du7v zlrWuwwX6)&#tI=UNkaks5;)I@Kn9P*&?~e1pFDl^@cNO*#$U1i*3rAK%)-Q$q`lD3 z1mvd;1@A-6xPd9vk1IHTIMdglFlDGKm6aU|b!#ZshF*eMM9pPl7TIsiNaJ7L@E+_n z7QMBUh;eW(3s}Z?M#M&YueN@IQ`~AU!O`ZDB|sLc&sM5S)UR#$eAvX@9*x;p`e4>imXLl+9e@t9#Gu) zMAWThWjut8h_GUZ)8s<^4{C!EyYqyo5JF1#45$SMA+*F6GwX2?(8M|iK`pk}N5xs~ z##xS6ynAyu6a#U;1+0jk3GFS`t_|w-wTg>QVW-kgguWMs1U>@mse2H7#jZ3s2=of0 zMAXYc362nbsPbE`AJZF^mtTZbk{L&)haT=ooE;lRSWUeGw7|U z@qRJ!Yp1|%- z8FdGbst+Rxv}!-O;4atwgjKuMjUFj^`$vfpWJ0lYK9NlfLyXV%P-d^o98r`RU*he6AtbVM7=753-g8(uqDf^%+Z{6+txCPybYO#+Z!1xQ=0p9PdM9-1*VkbNw_uKZ zgPH*J0OrAi4$`tRFoeIpb`-hLz!#JEV${3uCe{;+DasS~;&&5nntBrQNRBv@keN|9 z0kUr2TpW&ok84Ljn**SKN_`t(^5ZqDl%03VmD%#L1r;|*C!om8Gc?1w_8qsSk-lvl zR4$%^U0tv|Nvu{MLyoA)=uUX`Tt5RwS+Bst_$cpu6p5d^0?*+c?PWND3^=4kO-+Ch z{48QJ%JQa<@PV#j_!i`SKZjuiCoy9fIGm3U!|Yv1Kh8l?s z%ABHrCK{TXxxt8bpai8zad ze)TX0I=+r5Ex`5po1X&$B&-44|D3>;um%Aa&-*nB-2B`Nv73Y7yU_l|H%+N;M@#jX zr!d}_p*fNmWJTN*J_d#YSDt!5uDxUtMqNcOjh~zY3x@c!Qfc^nbgn+07janAP-dax z(26TB0MjCCLl&q427EuhaLhLdzmivvCp;Rx{~ zGNCQtEpHlkK1A$z(|GlH;0H!W#$m_JEXIir{e;6aVh2e&tB%VZxyCrbIH7)Bo=Rbq zT$D^AH;r5uBbJiuG)Q|Szz`@5f+27{;(lLPGmIfJ=UO?r3r7v`Ck=HXb5GdA-Nc20S&eK&ZdR;>2-3i878l$?Gl#_>YkHGeq>gIEewh ziPED$^@1@H-;v#w8i5fahw~w02P}5G;Dq!Z?kQEbQ(1)SoaN&`tARgU!66xS)(d>?=TOu=OCf#dG@#e6Lxg2)3IHP&s zlGZ#oNuCSPgbUvytONqwm#}t^_&wzP8R6qp6$2aVHYH!zB3mypvID|?JTBu(9=Rbr z0B$t?;{@@do+jvF(zCWq(uSXC&MtOC_rs{`?Wp%c_y5k`7^lf1LN+2=C$bqD+0@`V ztYHv?*l~tqS}=%ii$OpV8{sz2VF~ZBj(9l;C4iAo2zP_pYQnn;6oPOL;{)dwv*tAR zA)KsX7rjgn@bb2C2rc^|yj@XA4-|GEyM^I&7MZ{~ zZ*btGQNkr(J2_0AoVLOP)lNhhSC0|sZ1%#S| zioN*anNYQ#K{G5$3_f)ZSJNB&b%O~&{Rm3+v0t?bH4i%o z<8y|=(ZI3pBU^XyP`c5E?r#93ZlpfSzv4PiFh~(lw5V`p4_mPV~0U8bE;!gnAQ|c4I zOSlms4`+Hv!<3W6dDuf1rb1&V*4SGX{!;L+0Z36i&q3OaFGAXZY&;;8H3)U-XF(x@ z+p&^>=0Ma0SqG&8X*#H(fEosI4*6c_DwLEr2lB`FqXX2Zm~4=fMJ(wSlUr=Qq7|mA zeD?1n8Q;{BmRo{^@P8#z&H_t*j0stf`iD%`k!Y&{>{41p7b+wcZxE1`ysL|i5$Si3 z+=#d>`FN+X!=S)W5J`C#(%zGc#=l}9GJu>J*fRx=uobm+2NDZ`wcg*Np3kH3Rsuof zqFr4v=dYDYP98nsX=~13FO`Tv`tZp^M^79&a9tN zyPL2wRQMJpO7RUyZ75{~>H!GC>W@^;yol+BmB&OgC=h&6%Hmpn2hMQ$&Lu4q+@a{- zhguQ&^#-S;yr>eJXb8OdI}p|RR$T%rO8a}YjRE&0u-YoHZBQj%!8;0zn>+Ithd{U~ zcMGRBYx7bP_HS`QRFbh;J%y)H2zoRME?SS9qQmj%<14n z31pnZ;n1ktd%o_9m)tBKJw^lT#bX;idRffyLs68lERFD<3FdV8yCR2~#0zr`$`VX! zG=2w$|6sib%%%vSg#We~&h!CCBKGi)zi6H2NFz9D_k`6z;blyQfCUm%LZ&YD5fsw? z^`358^&;-Cvco9P!f};Kr=tK6X-98&6+2Pb;t)=>AAy#V2mm)E}}Xe9!tOj0lrkA1B6e6*7PpS^oJQH)m`Xbt653t(vh~T zo7L-y4y}rPEluR47I@@-OvaRY@Qpw>fdUIRVI7Vptm|5xU9P!$sSDK=ZNZQ+cj>L8 zxZLG@rx8lFC{Nbl|S*IqrT2 zCJGuYUS}r}rQn(my~?_Sf&x=jwx55L8l!Q)fr~I`(K2A497C`etYs1{2!r<-aTd4` zf#D>VsfR`d9n3?02NMm-uOmX2OgJn*p7wq;>f-YjVx=nZ_S`n9`;%NEGeIa2GgzwQ zM$1wmt5_Vji6u2~ILiR&4lK~Pgk|V8 zj;$URu09oE4ZD$@MU#?*7~CfmNi;C>R^WF9KfS`VBogU+!@)tA+3`U3*Mn28SXMde zjWYYES5T4;Z%b;%{~TXMO;y=^w~mg{(MBl=9`gq*d#(b%TUi3nuyVk<(@kkxdc65f zf|)~|F&J5bxcHEP-51W-`+n#vDD4EZ)F+#?)GIcivoRDFJ<#IG6 zEE+ZCL%0x&4a81Zz_C7)HXIra;VU*Eh9Y14Eg+y`m4`U35C|xxlj>;;lujE4XdV;~ z3kU8%yc~nbh4mlQ%cve#J^+3jk^YR>eJa7U81PYgfDxoZ&meg2a%BSXToncF7^{jG z6y{iZAwktznI@BlGoxy^6$`nAiwKH_a}d>Il`k0W9OGDmx+55FFVGIn8j9v6;F^7> zLPig>N@Hc!%|+!*bDzVtJ_ygUQ0H8;c@Q(c+Se*EYXe@}X=UJ%vDPzVe5UIWt>o}2 zyiTtnGObk_#KjK4Ij4z86|5v6_k$CT&`Wp07v9F8-oW4q-oI^_ZyRGEdXXi?-t)J4 zuVv~9e05m*j47#<3Az-k?xb(KXvE)2l@a!Z1>324b~z~hLqs%+1hGM`kzw2%eRp|M z&{!u0^?ICgOMg$*lVXbcDJDP7a$p-{+$V=*kF(aUN2tQ7Eh zz~2%q?Pzbc_p)G(3~sP$!d#>bv;qq!Z9Q$Ca?Cb*X~^w`CQP1JF47Nan>%y`Q;zX~^B7K5kVK&FA znZ&&WjmMDeI8!IN#w=Kem7EgklB?)t3BTYOFpS+Qa-|Ft;;3{({C7?(eYXew{#1zk;he{PgbIf6W-MIs=>)vo1Ja2hS2>S<&$B4F@g z2Xc2sVV)rS2!qptH$^gf06K=EMPS~JR&KXAOBuW+&$6Y+cmY4U)u(vY)%1UhykBP1 zg;~Rp;yu)>*=-#e2_JkS8cW)mIOT|sa>x}P{V|Lt^J>Oj)>6=n6lVkqt=#it>}fT&rQ!@p(teIFsOiQ zlEj<#5}#RN($$+}6~D&bBw7)X;5~*$I-J@W_a5lguYt%*Hq>*4Yb#DreTnZAuEa6Z zT&eYXITK|5dEEV%gn{xAOk;_mI84hK#r!Ic=#N-?4u|tkk-QV{q4aE}ab$30arEKH z;poSa$1#9o5XYu%3`Q+jnzp-eZO3oFwZnPWHm? z>@?bNHpjwiEI^Y<4x%}hJy#l12_|SA-Av`N-4uS?o@K};r)9~nShu+8gcVYB6CtQr zZ?}<04#H`56AV)3ZLr6oy~wavn5Abks{#loD8vU~O%z3^znJE-bLyyF#Iyz@LD zvq4f)@CVEcX>bJzxkZ|loIb3-jLnnZ=ByObeo6CUoJi|=L;V6?By+~483cTH%^E2g z7C$FH((mCpJ6nAb&tgJhCztiI#p4`xsL^M{M#pVT+9Vss#~6x82_yX0*Dl zr;04p{pTdzQM@dihZxx#{>yP*dSD~oZ=9A;OH0QgCkPgeQS^;_M`Z?*mI8&bcKiUm z0SBV#WUdKSHbif2Fv7+ehWcgH6RI&h?_z&`AEybawGYfiEBSG6XAgLzf{_OB&Hz8& z1~9}=NXnQx4*+l&9jT8u>>kj!LH?C@fqXCmNJf634g81S34AI29@}uTXB&uIJzyup zX@j3Tr@T|ox53W}H@|22S^7P~|ER#PIhDqm(3;i*U#PqUd&C$*J0bL>qi(#(vG;Yw zTiYOi@IF1SOTI^_Te_Km|5UTdcwZp5#FJ3H2_r}#i8)#(1INPV6EKD8r7}>(gyJLl zjVoeW5*aDu-?zrc&q?7j3w1YC2t;_u_Nbky`x1y0e2>jMF0EsbC8shFMmRRXfain`=`1X@!BQN3+eBHz*U-J*f0 zHG*jZ_gCBC=6N^V8NsR5+1+hG@3;@pc>7cp<>ZgQWJVWo$o3alH#n#TdiZl7s;ZrG z1v7$8H z@fd~=k+I^N*Sp)SX=P5L#<*n@th5LDJBx)B?Fibg7W^vO6{rP_QwEaz2|U-`e*^5l z2>ku*MNT6zMk+wfhE>PBPj5yKy(<<3jsrve2f`Gr^1$L3EKJ4bJ}u#va;?9W0Bx8_ zj}b5qSIqFRevB$ryFe&7x#6Mr4%n}?08_t*^KPKFv;!5)_b1vw+WRgb1&>5>5$%6@ z92amn&p=9wbgB>GOh-KDYWl-H(XVy>vf|#<&9-V-gPC z0e6rtU1*Gn_2e1E%-&3j&bSF}>VuQ_l*N@Hn*yy7PHNE_3+oBd8uNOy1+h?jV~*fT zVDICC0PZjdrr%NtwAT0xMglw6G}e6Mux}iJwMZl%3KWX7vK9mw zX3VHZ_M;})ySqqEq=fDVJJ6M5cL9f>51JTB zE;0-~QO+Oez{}SN2eBT4O1Mfl>hX?}Q@t0Xm+~PlOkJCr^b6tDjmb+dT)Xl9=@zaa z=h^d9*xW9_oyn<-Q`crD)gR&Inw$PH&;AFKKV{NF9XT6h-hV}65$0i>Fg`W<7pFr5 z21S;r;)woe&^d?0`5cmV?%3|kD5d2J&bjwuRxD+(nXv2{ zgY?~5aN3)dP+^3U)wz0I>fkaChG;n4^B4Yqh!)`z)Rt;P4YYkhs@|qdo)7d-?!HQU z4732exy~qEx)1~7qt;PDO*WSnV~MJ$cJR@)H{*Pu$wtixjFUs51BaU>$vpC;x814M zYI6%`k7%o&SQBB&JACj_ucJ5uz2}@Kz4*#o5DNe^|{C zRMQt+tek}bQ7rV@P2>$qJwYv~qfgg2htuN$H1$ej{w$1Dds+LmT-5uD8JzpTH>7bGJofW{dvmJ3sz>VT{Ss&5C z3|Z|396~3;3^4(~o5d{w+Crlk6ehIiXJ~pxWg+waKfKeTKp{TPRzH|Pr6egb>u8Zc zfRF(s?=oXikRr39c#LovoQVp-bH;P>%0P4gnHjcA3gVvxo_~!eLjWb9s|&!7;{Fe{ z07S6w{x-F%)m87bI-ETh^5c2cu6k7Kd=%#f|Qc`!Mo;lyI?P zaN3a5cXeCm`s*G$vvL5`N13q3wzotK(obt-_$=Q>p#tJBBuDk;i&>dD4fXLgj#gr^kJogHzQMhG5tL3m<-Wd%s6CB! zxOuF>ea(OhXT-c+avy7bMFwH{p#B$PFJX+S=zZ9DQMSPsMGEzcDite3eF7W>!Gp{c z*XIuE-F3@W&8cF07ZDG9|N10(M63Bu* zGwI;RO70w1a$^||YgG*w*g*|~qt9osbSF(sf_uDtI&N)lEBk#x*`ZchYYU>W4Q!#n zSmcpULz%`E_T9vCpbGowT5VyyoE0A zhrDNH(lPOOeg#XmF;ig2tRl;zsPzH*o*fzV>jJLyDU0z|pNmLIdS(z}s z9g6d+!)dGSb_G~z!M^o=42_?`_NZlSIE$?zmEy@It~yrB>JzwIDj>!>tXJe*V9vMW zhJUk%hR+mv~5`N9qfQ26B4gT^rE`>1w7F4P_VfizrhhxA7|m)OumcBmzfCHm4sC6 z4@wiFvHKYk^b@!dOG2rMr?C?^Bg3)5T#DXzS|yT*4%feF2(sMTmgGJd&xa*(7K!B{ z$#=dbn)UIv7U7@41;_*hAI$lh)D6a->S&gL*pndQA{S<%w8Wr+jBz!d^1dY^8qxWK zP1##JmsCP26WESm-2D;h5YAR8Z)>Bhcb7Y5@LGiCJrH7yenmLEn$AW;g)P|C3p;^p zA_%BVr8sQdxEafM{6Sd<5YE+(X7>e_5QEI?=&vv+MP>pS?ym8L&!J)AQwV6@pK)v{ z9B@n4G)65L*|@j8OWirISU&0?KXj3sxdr7YgBM7srANY`LzC42{e>C@t(b z7)3t$Oz>p;DCWH1jz;maY^V zW%Z%?21c8RLSf5sZ3Jj{>KzhJRCVzYiMBo40=}!?!!5sQ3xIj(pm|^g8;Pp}d@6vE z%#`m5rt21t$g=gROEa1;KGaq`YE!VC8@Jl^MxtC^(v>MV=ixz&t7h7rMec)B2=96x zBiF`A1erdBybzD>;M`C0J>86AGdWiH8rk_Y1&La_*V9}k3ReawzlBxe*3g?#lDa6! zZm6OIy(r;{1TcydG6H^OfwMb9%q4J5`-Lt=L)uQ{7^dd!j=hPZIkH11j2@;yObJs~ zqK54k5_*KWopdCHcCf*sMkLX=_JDgfj#JWo{J%6dOM|2tIOFnj-K4b10;so-u?$gs3xa0^XNGxv=iIK<~ z%9&UT0dv0IUJjCnjjg;_ohV+FK3l?lA8R2#4i(!&XgRzwdOJIDqLar(;IcG%l_10% zPN)xx_=bgX(=ie15sxgn5adOur+^>u9P$+i^=Sz89}0x}VNn3UfkL64^;q!EgrdB- z9s9hc2tx$Dr0qJUsSeKrA|yhg{kFXiW`tF-LxAcb;4d+PSb;(j{Y0xm%VEw(`572( zOOD$2wII_H{?%WgK1ewz*GB4K#1U5Y5a;pj z43vlQmmgWe{w^3WTB~9ydMLF-Pa>yGkEof>A#8uLwc;d;rz3@p+ktb133z_qLd+tT zJ@S0iO59DX{k)7KwqnHIIiL!deW`MB&Pa32AiifrB*&6M?BRp8^s@FIUmCzdL1G1# zT*?!2P8WgIKODoVeAc4#@>r{nZx94E>%0NvVb7C}yg?U2Iz44vuiy6kCM-Q1u(10| zNABkKH?2dF0LBe2<={OV0nXtoYTl@B8CpBEo*4;XBAx;7G9S)p6;kkSi}!&DD}s-P zXz2wjlQ4rjkRN8Q%gja;m{GS-`mE-TNio7piUyAhYy4? zs94VXf(K6LyfojBU;UjaFL?%kUQ*C^QyZqHQJ4Na*Iu3-luO``pQL)~CmBWUr!kk{ zb+QzS-=vcLpQRCg+Cs=aas=bE+J!;bTf!ChSH9^d7Oc96$Z*=L4VwvbYJ7}jK+AkA z$t{+7#EoId!SK*wuG3z;QLNP-jwj&+817uA#z|qLoDcKRyF2SP^D}j!%PQ-x2=4m%B?h26!v2Rj}6vX&*Y+}w$ z)Uf>-IrD4S7elWiGgzAlj)ku*WZGQf#7r+r@4+28{6*nu=DV7%BG1t?mXY_7{8?vou z$QjTAMC4NNxcRjgv}tcZyPl$13jHmU>NI{O;(Z;I%})}%*FDo;C5GYQn78$BXovK7 zUbOZ>!o#^2i`KSvL9}7B?7wpBD`D%S4%M#?R^idGB&+bgnH}KYxqa2P_6@vz`BiM^ z9&YM>7;n?Q*3&pcaKg*X5Q(hHGm8nW53&MJuf?O8W&{`C3a#d_7H@=ys=Z1 z@iflWJ`9E4eL0G~BGE=RwG+6Y0E$qM141xkvR8mC7qAQvD5hSJqXdLNnZ~Y%H}e6? zCnzW1#Pdoaxdcm!Kux4od&qBwa0aV`*t(-~t)}^DJsE6hB6iTc%a=Ym_U=NU7eA5OdN`Aq#RT3)!ByY4W4_`3F zmeVvfw#wnFrTkJU9d^-MuqFWO0q`x-wsKe(fShsUz=9-iJs>&nYs)z(HBF$VuuM^M z4z=YR=J!ca)3LU?kMkR)$UAAB5*V;)gkDb=^vYw_>8Mv8M~!E!vu`8P2x?L0;s#+bea1+N$Z}ruAtmAZLNF1b=7*-nzF97_0)Ci1E}A$ zZUlKzeZao-y;u-ag}DP#reRIX_g6=853>l!X>)g-A#)HEt%rZ5T2^LTdWt5JB(b+NY<3pxfOTTQDUpRsXT(; zZ4$i|S6{>V=$mopEBM_W{O$;T9|(ST2EPvmzq^9phk{=db$<!4Dhvgb`a;GGBb_RF$%AKon=ViHbBDk|p?i`gnugIP2!JYkb zXH@PmA}yg70L!b)mHtdGj<08Ug%aCv_jUIG&#Vt(d)92@tM2;%C8PlKLPC9ZJ-c|2 z=Rx`H$oo9`Ogj5jr1HZ?S{|xdUwanm&6Ucc5IVYo> zQ!>6u^_t+&Dd59nf6mo+o@o5SmrEpKF=JhY<-A& zb7MDh9|>yylKVtZcF=tio2(=k&si0g!r4|l`G?Si&I?up1IK5lD%&w-J6a#ahiT21 zgC#q!nrFo580@3)MD9FvR;>GNSokCumvsEFI;V5FN4P9p ze*ppU^`T7y-|)j`4fr4?b_bZG|JsJSjnDyn^R?k(+X~p-@I}zmxi-iqs^$@MVj_sY z9;?*h<9AIwCl8O|RuGRr)>+9|2MpL7u{Jwnzliu?7Zzix`w}DG`3!V1#K81;{RQ*L z$&<$)J9_xoiQ}gZA3b^E$gyKb3064kn6p@&+}7qfSU|%@V;_kgt=5}1A9?KLVTRws zB0-y@4zhnF+Qb~4FKeXbre3TPHgin0$uZqyW8E-6YED$l9-ZX^W-fRz!)k&qXFs+e zdg~}A$G0pb*td8yKXkKz4`PFHC1KdtLE^VNVo#&O(j*wahn4&i68 z|9)qj(Q(kaE7+;S&ji~kV8>m6j0((-BHbN0Gq53MF10!zt=@y!6Bb*QR;j+qRjQN=vS1|*Pp&|etOb-xM%IM1SWtC54v7{ zVnT3yf@?##*()rRW$gc9U-CBVNLft-U<&m$jB6T8Xke$8{ttWpI!6=>7E1Z+&pmbZ z%7ybY`2Oj%SL}d5e+D^NIiN#@xEoAs>7SPgFdm-{oIHQSJMzwtV<0qr=nwjG9Rqto zZ

Jebn}*dLf4vcN@+&Lo}ckP*#!g`AwP(VZeYM(?{6EO4?lALY!HFUDFIwy zY}Oq!_v@*0{Gd5zH*h)Gu;``ZwOYpY4SVj{N&}Z@)|k4jb6((wlE#arfoBmg1Fsx{ zrQvPx(9ha7JwcPVpX4Woz1J?`oB{+fjVw+V5q|}y6U1!KX{0--cVMFB5Nxi^#Sz-_ zv5QC$8R)XL%3WO{L&QR4THE!A(Ipa<*CB(j%yMqAh{f{hU+mo`$PE05dDD|R7#w!^ zS#AS?2#R@e3SxDfe9V?x-T*ThgBVV0AX`9*!ub3Nx}SY16`R;90?5v{-SYY-CW0L( z5O}?S@bPdziZ+j=Hf7LC8@Lv2p*rxHTPKFhdogUkOtPV!8%WasW z5k{j)gdIhAN`4W|p~r-F2K?+rH;+H%XVH1$ee<_oL2y1KON%D-OX8e`EaB&`iO_Ie zLJs|YeRWy!D?&N?%5>-Zh1Pj_Uf-2!;ep4#-(n8hg%`^{;J5sokeCQGVhmS*!oHRu zjmLQ=#_MsOJ;jGI!5C?y32hmrevM6KiT;{k$!ko0g~@R=4L%%*1^C_rx<=NxG5I=^UuW_gOnfH4 z$>g_~MC(CHxkfWwhb_xON_k|2Z*$b|v-D$Rj0br3Q%uBCO^Z$`2hN1P;HDVk%oD+c`;^ZAWn z-WIA*v@R-I#_;7RL^9zM_4FMsA`VeP68rTY{gYfIBi|s#lSlouuEOurrIDv-=2f(! z32FFA_@7-rACL={DT1+Y;oW{Fv^I)oN6|g07jf*s8DbY_7ZJ{_gQkS`gcNC+lWm`6 zS6l5)`2zD=_|M>bO+YlGiUCriC_7QFs%oK_oDK4Y*)z*cYqf4a!7gy9<0L;4dnm$` zcs$A7Gz%C}{thMZH5B|~N*o*kt{`S2p3M}J@^5RQP}p9$Uie7tP+>>mslv6wzQWzY UZd_l>ZqH?~=g~)EV>#ph15L*k761SM literal 0 HcmV?d00001 diff --git a/scholar.py b/scholar.py index 13ccd43..a7bd414 100755 --- a/scholar.py +++ b/scholar.py @@ -239,7 +239,7 @@ class ScholarConf(object): """Helper class for global settings.""" VERSION = '2.10' - LOG_LEVEL = 1 + LOG_LEVEL = 3 MAX_PAGE_RESULTS = 10 # Current default for per-page results SCHOLAR_SITE = 'http://scholar.google.com' @@ -353,7 +353,7 @@ def as_citation(self): if you have configured the querier to retrieve a particular citation export format. (See ScholarSettings.) """ - return self.citation_data or '' + return self.citation_data.decode("utf-8", "strict") or '' class ScholarArticleParser(object): @@ -512,7 +512,7 @@ def _as_int(obj): def _path2url(self, path): """Helper, returns full URL in case path isn't one.""" - if path.startswith('http://'): + if path.startswith('http://') or path.startswith('https://'): return path if not path.startswith('/'): path = '/' + path @@ -982,7 +982,7 @@ def apply_settings(self, settings): # to Google. soup = SoupKitchen.make_soup(html) - tag = soup.find(name='form', attrs={'id': 'gs_settings_form'}) + tag = soup.find(name='form', attrs={'id': 'gs_bdy_frm'}) if tag is None: ScholarUtils.log('info', 'parsing settings failed: no form') return False From daea0ea0b8e6576dd5c6a8bab95c481f45a4ca42 Mon Sep 17 00:00:00 2001 From: machaerus Date: Sun, 14 Jan 2018 14:21:04 +0100 Subject: [PATCH 2/3] added option for returning json --- .gitignore | 1 + __init__.py | 0 __pycache__/scholar.cpython-36.pyc | Bin 35395 -> 0 bytes scholar.py | 8 +++++++- 4 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 __init__.py delete mode 100644 __pycache__/scholar.cpython-36.pyc diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/__pycache__/scholar.cpython-36.pyc b/__pycache__/scholar.cpython-36.pyc deleted file mode 100644 index 118340c84ff2876b769e05c16fe8c5b49f8bbca8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 35395 zcmchA3v?V=T3%Pb)oRIYS(asw$5W&6OJ1w}9NXiuXY8?Mk7qry*V4RtX12H0Rgzk2 zb<4L(wpEdLm-VnrmK~VDh6LCo%aXIe0djIUoK14h!U=?Ifb7C1ECh;W;Q%2Sl5BuL z9zz28zW-KLKh&O?J#dg+x2ta5y7k|C|Np=L>)tvxFp!)4#!IC?_*b#m@5S2wcHsOR z4(G*qEM~<@G0U*xb)#g+Z@d(j-$W@PzsXWkep98?+HssE>gnZ7DYKj{WtaO(eFmRS z)^p4KrT*o7DKFQl`oQvFY0!wh5Nlja#_UZ?IrWc?82_Pkx?TvMW=%m2o4aZl(lxvk zYs6oStrhTGroLr)xHK%aPU%|TG*nX88TpY3bc&2OC zDRQ@2#`c(%TZ*I3-S%i{yVY;yUyGG?G_ux!HORBeF>8}mcr9iX><8{8N;`4A*&4$2 zki8Sv58`@@HH_1i-^BGcYZTX`a{Vx_w_7`Ky+f|YaQ%R_6W2TC z`Vm||XzjxFF1g-~>xV28*QUK2_3gp+!`2wC$K-l1t{<^><9fH0--qix)?QriwfCX? zeq8Uf_TzfLT#w^=+&X~k15*9~uHR=J#PvbBexEgA718d4)+y_Vb@Vl(G+`aHjw7dN zJ!+l6?;-oJbrL-mS2OHa`_R1@d)A6i$HpJ?empm`P;<=XrnOqP%@x(WRkLiztkx@z zgWtKPGHNKd zw;w&U&|J0;Emx`w6xfod-*c;I_;io2*YAWyHr|@cxoPwH_##)4qHcJ$z0Bp8gZd8`-a@o(7%NnzB zo-dbgu2$;7olLoGHLK-v8?@|wB_I#u?8}6PW0y4)ufHj9`!6-sa>dm^pE$2c$gHS& z8AuaE<=C!Enz~&PB12&pm{mh>f@Z3q8MZ`i#f#MllWj~!nFyGzL;&878~+YR2r#d| zDS(GQKq5aM0I1<Dqjwscdg-su{kM zcxvBSsn@DCGIVFfuGZ#iwl$st?XNk0M$?(&XKz=exsE?LSF5|W3a=IUADD4^xuTX< zSNt^R4iIf8D*JuQl_k6EU^KtRQOF^QrE}@5kq6`Owzds(F)B%Jn(jp`ejbDD)~b3e z)Na((_C9&D(9Z+7@f$dVM`evoFRdGar5s%uBtagO7;t;!BnMptQ-1Ov)GVV{!_?s?VyY~JoljUb{ z^U{^8lRbNWFIxBw91h2UJ)bn>Uox3gyw@=TDP)|(;haL!IRd0fu!Z!eAqv|mD{p13 zK4B1PT;;6(*Fc{b8IH&RVAdn@oLj3q-qG{sZB<#}s8HOwRZ+F(s$;IYcEiDa&{_qu z4CZ7|thoTGXRWA5hwMUeh~6Tv!o-u3Imk~E<^5!>G1v4{mOZ;V-^pV7QThT7=Kzwm zY$y{XF&#q&->qd~HxAZfe9X|YuT03m@`RtLHC#XUe5Jl>3!7Ht^&qPRW*0xP?98`y zGyBxfp&6^nF5|g0P*}u}7!W*b3>x0H_MRxVN_MbBcDulP9VZYPz!DeZ_hL&KHAJ|0 zy&l^R;bAekgi({c)M9!uVy!!>F$z5y*WR?zM#Akiw^?j3OIHeUZJBuP>if4+lxHZIo}M+%2dqiiE0E%p!X`Ups+5|+Wm%h@m6=7i8V%W zZMf1^_l$X{cxvlx1%8(j1kWPjaXr`!9h5bx1nQPJMx zNcAQHeMkd$Rs)B=>li!%sRuBYUDv>zBQWd~>X|)2i3;`xg+oVB$QQs7HKgy zzWKxpF{^Jey`B^eCN~vu;o8`cBzf@whbq4WiY=v5Tx>K8FSTwzb(Ud-Fq zh0GPF?A~#UXX?#rrS3d|6K%XeWkkCO1kwU~ML;0M&x5jF9Y!Cpcfh!}OAq{=_{}5- zOc*vsi)gI2AOY>gi-&sw_;&6LCo{-(D^I*X9I7EyJ$v?8j=5)#BaB8~ zu1@o<83-q$i}^{uL}<%T-=fgwKuD|H@zWGI8>MVmjRVY3%el->hhP~Hr69n!nukE zS&Gv@qOr~8qzkNbjn^YQB*tk8#7e&r=at5~bwgbOjwRRQ#4}>05eu)sEj9@PA3BP2VZ0>*-D zg`^iXn?Qr`sM>_nZH4EgX3(WfC@q8A&@E~EJvD18wCXBQ(Dsn!h{oaIu@*Ts#RtZCvnE~_W zO4fpzbUg0;a%3~PVbknFHPMDCOhjoaiWQUgC99$=GnA94Uc%5vdk>7qtmF`u0$b6xvjMI+|-vo{0L@`K^X#nKz**tTL2d@d<+lktI`3e<}n zHflNQ6(%2KQfBfYCd*8S#cGv_K+;MC*EurV2n+c{HkmZCBiU3okxgflNb|XV#S&E< z(O&{*=WsY5N7A8PJxB!uT9Q~1pe14Mw6ge3TTn3Yn+aS^p`)o!yP2S0iF>KkZ|AjR z33i47oQqQl=Yu#GhZ4>=X;)IIpj}9%&0@UV2%P# zgj7+nDl4K50temOb_LkG3`&H#F;oVd&>c*-2206YvtDoBhMI>|>v)|e5Yjyd$Du7v zlrWuwwX6)&#tI=UNkaks5;)I@Kn9P*&?~e1pFDl^@cNO*#$U1i*3rAK%)-Q$q`lD3 z1mvd;1@A-6xPd9vk1IHTIMdglFlDGKm6aU|b!#ZshF*eMM9pPl7TIsiNaJ7L@E+_n z7QMBUh;eW(3s}Z?M#M&YueN@IQ`~AU!O`ZDB|sLc&sM5S)UR#$eAvX@9*x;p`e4>imXLl+9e@t9#Gu) zMAWThWjut8h_GUZ)8s<^4{C!EyYqyo5JF1#45$SMA+*F6GwX2?(8M|iK`pk}N5xs~ z##xS6ynAyu6a#U;1+0jk3GFS`t_|w-wTg>QVW-kgguWMs1U>@mse2H7#jZ3s2=of0 zMAXYc362nbsPbE`AJZF^mtTZbk{L&)haT=ooE;lRSWUeGw7|U z@qRJ!Yp1|%- z8FdGbst+Rxv}!-O;4atwgjKuMjUFj^`$vfpWJ0lYK9NlfLyXV%P-d^o98r`RU*he6AtbVM7=753-g8(uqDf^%+Z{6+txCPybYO#+Z!1xQ=0p9PdM9-1*VkbNw_uKZ zgPH*J0OrAi4$`tRFoeIpb`-hLz!#JEV${3uCe{;+DasS~;&&5nntBrQNRBv@keN|9 z0kUr2TpW&ok84Ljn**SKN_`t(^5ZqDl%03VmD%#L1r;|*C!om8Gc?1w_8qsSk-lvl zR4$%^U0tv|Nvu{MLyoA)=uUX`Tt5RwS+Bst_$cpu6p5d^0?*+c?PWND3^=4kO-+Ch z{48QJ%JQa<@PV#j_!i`SKZjuiCoy9fIGm3U!|Yv1Kh8l?s z%ABHrCK{TXxxt8bpai8zad ze)TX0I=+r5Ex`5po1X&$B&-44|D3>;um%Aa&-*nB-2B`Nv73Y7yU_l|H%+N;M@#jX zr!d}_p*fNmWJTN*J_d#YSDt!5uDxUtMqNcOjh~zY3x@c!Qfc^nbgn+07janAP-dax z(26TB0MjCCLl&q427EuhaLhLdzmivvCp;Rx{~ zGNCQtEpHlkK1A$z(|GlH;0H!W#$m_JEXIir{e;6aVh2e&tB%VZxyCrbIH7)Bo=Rbq zT$D^AH;r5uBbJiuG)Q|Szz`@5f+27{;(lLPGmIfJ=UO?r3r7v`Ck=HXb5GdA-Nc20S&eK&ZdR;>2-3i878l$?Gl#_>YkHGeq>gIEewh ziPED$^@1@H-;v#w8i5fahw~w02P}5G;Dq!Z?kQEbQ(1)SoaN&`tARgU!66xS)(d>?=TOu=OCf#dG@#e6Lxg2)3IHP&s zlGZ#oNuCSPgbUvytONqwm#}t^_&wzP8R6qp6$2aVHYH!zB3mypvID|?JTBu(9=Rbr z0B$t?;{@@do+jvF(zCWq(uSXC&MtOC_rs{`?Wp%c_y5k`7^lf1LN+2=C$bqD+0@`V ztYHv?*l~tqS}=%ii$OpV8{sz2VF~ZBj(9l;C4iAo2zP_pYQnn;6oPOL;{)dwv*tAR zA)KsX7rjgn@bb2C2rc^|yj@XA4-|GEyM^I&7MZ{~ zZ*btGQNkr(J2_0AoVLOP)lNhhSC0|sZ1%#S| zioN*anNYQ#K{G5$3_f)ZSJNB&b%O~&{Rm3+v0t?bH4i%o z<8y|=(ZI3pBU^XyP`c5E?r#93ZlpfSzv4PiFh~(lw5V`p4_mPV~0U8bE;!gnAQ|c4I zOSlms4`+Hv!<3W6dDuf1rb1&V*4SGX{!;L+0Z36i&q3OaFGAXZY&;;8H3)U-XF(x@ z+p&^>=0Ma0SqG&8X*#H(fEosI4*6c_DwLEr2lB`FqXX2Zm~4=fMJ(wSlUr=Qq7|mA zeD?1n8Q;{BmRo{^@P8#z&H_t*j0stf`iD%`k!Y&{>{41p7b+wcZxE1`ysL|i5$Si3 z+=#d>`FN+X!=S)W5J`C#(%zGc#=l}9GJu>J*fRx=uobm+2NDZ`wcg*Np3kH3Rsuof zqFr4v=dYDYP98nsX=~13FO`Tv`tZp^M^79&a9tN zyPL2wRQMJpO7RUyZ75{~>H!GC>W@^;yol+BmB&OgC=h&6%Hmpn2hMQ$&Lu4q+@a{- zhguQ&^#-S;yr>eJXb8OdI}p|RR$T%rO8a}YjRE&0u-YoHZBQj%!8;0zn>+Ithd{U~ zcMGRBYx7bP_HS`QRFbh;J%y)H2zoRME?SS9qQmj%<14n z31pnZ;n1ktd%o_9m)tBKJw^lT#bX;idRffyLs68lERFD<3FdV8yCR2~#0zr`$`VX! zG=2w$|6sib%%%vSg#We~&h!CCBKGi)zi6H2NFz9D_k`6z;blyQfCUm%LZ&YD5fsw? z^`358^&;-Cvco9P!f};Kr=tK6X-98&6+2Pb;t)=>AAy#V2mm)E}}Xe9!tOj0lrkA1B6e6*7PpS^oJQH)m`Xbt653t(vh~T zo7L-y4y}rPEluR47I@@-OvaRY@Qpw>fdUIRVI7Vptm|5xU9P!$sSDK=ZNZQ+cj>L8 zxZLG@rx8lFC{Nbl|S*IqrT2 zCJGuYUS}r}rQn(my~?_Sf&x=jwx55L8l!Q)fr~I`(K2A497C`etYs1{2!r<-aTd4` zf#D>VsfR`d9n3?02NMm-uOmX2OgJn*p7wq;>f-YjVx=nZ_S`n9`;%NEGeIa2GgzwQ zM$1wmt5_Vji6u2~ILiR&4lK~Pgk|V8 zj;$URu09oE4ZD$@MU#?*7~CfmNi;C>R^WF9KfS`VBogU+!@)tA+3`U3*Mn28SXMde zjWYYES5T4;Z%b;%{~TXMO;y=^w~mg{(MBl=9`gq*d#(b%TUi3nuyVk<(@kkxdc65f zf|)~|F&J5bxcHEP-51W-`+n#vDD4EZ)F+#?)GIcivoRDFJ<#IG6 zEE+ZCL%0x&4a81Zz_C7)HXIra;VU*Eh9Y14Eg+y`m4`U35C|xxlj>;;lujE4XdV;~ z3kU8%yc~nbh4mlQ%cve#J^+3jk^YR>eJa7U81PYgfDxoZ&meg2a%BSXToncF7^{jG z6y{iZAwktznI@BlGoxy^6$`nAiwKH_a}d>Il`k0W9OGDmx+55FFVGIn8j9v6;F^7> zLPig>N@Hc!%|+!*bDzVtJ_ygUQ0H8;c@Q(c+Se*EYXe@}X=UJ%vDPzVe5UIWt>o}2 zyiTtnGObk_#KjK4Ij4z86|5v6_k$CT&`Wp07v9F8-oW4q-oI^_ZyRGEdXXi?-t)J4 zuVv~9e05m*j47#<3Az-k?xb(KXvE)2l@a!Z1>324b~z~hLqs%+1hGM`kzw2%eRp|M z&{!u0^?ICgOMg$*lVXbcDJDP7a$p-{+$V=*kF(aUN2tQ7Eh zz~2%q?Pzbc_p)G(3~sP$!d#>bv;qq!Z9Q$Ca?Cb*X~^w`CQP1JF47Nan>%y`Q;zX~^B7K5kVK&FA znZ&&WjmMDeI8!IN#w=Kem7EgklB?)t3BTYOFpS+Qa-|Ft;;3{({C7?(eYXew{#1zk;he{PgbIf6W-MIs=>)vo1Ja2hS2>S<&$B4F@g z2Xc2sVV)rS2!qptH$^gf06K=EMPS~JR&KXAOBuW+&$6Y+cmY4U)u(vY)%1UhykBP1 zg;~Rp;yu)>*=-#e2_JkS8cW)mIOT|sa>x}P{V|Lt^J>Oj)>6=n6lVkqt=#it>}fT&rQ!@p(teIFsOiQ zlEj<#5}#RN($$+}6~D&bBw7)X;5~*$I-J@W_a5lguYt%*Hq>*4Yb#DreTnZAuEa6Z zT&eYXITK|5dEEV%gn{xAOk;_mI84hK#r!Ic=#N-?4u|tkk-QV{q4aE}ab$30arEKH z;poSa$1#9o5XYu%3`Q+jnzp-eZO3oFwZnPWHm? z>@?bNHpjwiEI^Y<4x%}hJy#l12_|SA-Av`N-4uS?o@K};r)9~nShu+8gcVYB6CtQr zZ?}<04#H`56AV)3ZLr6oy~wavn5Abks{#loD8vU~O%z3^znJE-bLyyF#Iyz@LD zvq4f)@CVEcX>bJzxkZ|loIb3-jLnnZ=ByObeo6CUoJi|=L;V6?By+~483cTH%^E2g z7C$FH((mCpJ6nAb&tgJhCztiI#p4`xsL^M{M#pVT+9Vss#~6x82_yX0*Dl zr;04p{pTdzQM@dihZxx#{>yP*dSD~oZ=9A;OH0QgCkPgeQS^;_M`Z?*mI8&bcKiUm z0SBV#WUdKSHbif2Fv7+ehWcgH6RI&h?_z&`AEybawGYfiEBSG6XAgLzf{_OB&Hz8& z1~9}=NXnQx4*+l&9jT8u>>kj!LH?C@fqXCmNJf634g81S34AI29@}uTXB&uIJzyup zX@j3Tr@T|ox53W}H@|22S^7P~|ER#PIhDqm(3;i*U#PqUd&C$*J0bL>qi(#(vG;Yw zTiYOi@IF1SOTI^_Te_Km|5UTdcwZp5#FJ3H2_r}#i8)#(1INPV6EKD8r7}>(gyJLl zjVoeW5*aDu-?zrc&q?7j3w1YC2t;_u_Nbky`x1y0e2>jMF0EsbC8shFMmRRXfain`=`1X@!BQN3+eBHz*U-J*f0 zHG*jZ_gCBC=6N^V8NsR5+1+hG@3;@pc>7cp<>ZgQWJVWo$o3alH#n#TdiZl7s;ZrG z1v7$8H z@fd~=k+I^N*Sp)SX=P5L#<*n@th5LDJBx)B?Fibg7W^vO6{rP_QwEaz2|U-`e*^5l z2>ku*MNT6zMk+wfhE>PBPj5yKy(<<3jsrve2f`Gr^1$L3EKJ4bJ}u#va;?9W0Bx8_ zj}b5qSIqFRevB$ryFe&7x#6Mr4%n}?08_t*^KPKFv;!5)_b1vw+WRgb1&>5>5$%6@ z92amn&p=9wbgB>GOh-KDYWl-H(XVy>vf|#<&9-V-gPC z0e6rtU1*Gn_2e1E%-&3j&bSF}>VuQ_l*N@Hn*yy7PHNE_3+oBd8uNOy1+h?jV~*fT zVDICC0PZjdrr%NtwAT0xMglw6G}e6Mux}iJwMZl%3KWX7vK9mw zX3VHZ_M;})ySqqEq=fDVJJ6M5cL9f>51JTB zE;0-~QO+Oez{}SN2eBT4O1Mfl>hX?}Q@t0Xm+~PlOkJCr^b6tDjmb+dT)Xl9=@zaa z=h^d9*xW9_oyn<-Q`crD)gR&Inw$PH&;AFKKV{NF9XT6h-hV}65$0i>Fg`W<7pFr5 z21S;r;)woe&^d?0`5cmV?%3|kD5d2J&bjwuRxD+(nXv2{ zgY?~5aN3)dP+^3U)wz0I>fkaChG;n4^B4Yqh!)`z)Rt;P4YYkhs@|qdo)7d-?!HQU z4732exy~qEx)1~7qt;PDO*WSnV~MJ$cJR@)H{*Pu$wtixjFUs51BaU>$vpC;x814M zYI6%`k7%o&SQBB&JACj_ucJ5uz2}@Kz4*#o5DNe^|{C zRMQt+tek}bQ7rV@P2>$qJwYv~qfgg2htuN$H1$ej{w$1Dds+LmT-5uD8JzpTH>7bGJofW{dvmJ3sz>VT{Ss&5C z3|Z|396~3;3^4(~o5d{w+Crlk6ehIiXJ~pxWg+waKfKeTKp{TPRzH|Pr6egb>u8Zc zfRF(s?=oXikRr39c#LovoQVp-bH;P>%0P4gnHjcA3gVvxo_~!eLjWb9s|&!7;{Fe{ z07S6w{x-F%)m87bI-ETh^5c2cu6k7Kd=%#f|Qc`!Mo;lyI?P zaN3a5cXeCm`s*G$vvL5`N13q3wzotK(obt-_$=Q>p#tJBBuDk;i&>dD4fXLgj#gr^kJogHzQMhG5tL3m<-Wd%s6CB! zxOuF>ea(OhXT-c+avy7bMFwH{p#B$PFJX+S=zZ9DQMSPsMGEzcDite3eF7W>!Gp{c z*XIuE-F3@W&8cF07ZDG9|N10(M63Bu* zGwI;RO70w1a$^||YgG*w*g*|~qt9osbSF(sf_uDtI&N)lEBk#x*`ZchYYU>W4Q!#n zSmcpULz%`E_T9vCpbGowT5VyyoE0A zhrDNH(lPOOeg#XmF;ig2tRl;zsPzH*o*fzV>jJLyDU0z|pNmLIdS(z}s z9g6d+!)dGSb_G~z!M^o=42_?`_NZlSIE$?zmEy@It~yrB>JzwIDj>!>tXJe*V9vMW zhJUk%hR+mv~5`N9qfQ26B4gT^rE`>1w7F4P_VfizrhhxA7|m)OumcBmzfCHm4sC6 z4@wiFvHKYk^b@!dOG2rMr?C?^Bg3)5T#DXzS|yT*4%feF2(sMTmgGJd&xa*(7K!B{ z$#=dbn)UIv7U7@41;_*hAI$lh)D6a->S&gL*pndQA{S<%w8Wr+jBz!d^1dY^8qxWK zP1##JmsCP26WESm-2D;h5YAR8Z)>Bhcb7Y5@LGiCJrH7yenmLEn$AW;g)P|C3p;^p zA_%BVr8sQdxEafM{6Sd<5YE+(X7>e_5QEI?=&vv+MP>pS?ym8L&!J)AQwV6@pK)v{ z9B@n4G)65L*|@j8OWirISU&0?KXj3sxdr7YgBM7srANY`LzC42{e>C@t(b z7)3t$Oz>p;DCWH1jz;maY^V zW%Z%?21c8RLSf5sZ3Jj{>KzhJRCVzYiMBo40=}!?!!5sQ3xIj(pm|^g8;Pp}d@6vE z%#`m5rt21t$g=gROEa1;KGaq`YE!VC8@Jl^MxtC^(v>MV=ixz&t7h7rMec)B2=96x zBiF`A1erdBybzD>;M`C0J>86AGdWiH8rk_Y1&La_*V9}k3ReawzlBxe*3g?#lDa6! zZm6OIy(r;{1TcydG6H^OfwMb9%q4J5`-Lt=L)uQ{7^dd!j=hPZIkH11j2@;yObJs~ zqK54k5_*KWopdCHcCf*sMkLX=_JDgfj#JWo{J%6dOM|2tIOFnj-K4b10;so-u?$gs3xa0^XNGxv=iIK<~ z%9&UT0dv0IUJjCnjjg;_ohV+FK3l?lA8R2#4i(!&XgRzwdOJIDqLar(;IcG%l_10% zPN)xx_=bgX(=ie15sxgn5adOur+^>u9P$+i^=Sz89}0x}VNn3UfkL64^;q!EgrdB- z9s9hc2tx$Dr0qJUsSeKrA|yhg{kFXiW`tF-LxAcb;4d+PSb;(j{Y0xm%VEw(`572( zOOD$2wII_H{?%WgK1ewz*GB4K#1U5Y5a;pj z43vlQmmgWe{w^3WTB~9ydMLF-Pa>yGkEof>A#8uLwc;d;rz3@p+ktb133z_qLd+tT zJ@S0iO59DX{k)7KwqnHIIiL!deW`MB&Pa32AiifrB*&6M?BRp8^s@FIUmCzdL1G1# zT*?!2P8WgIKODoVeAc4#@>r{nZx94E>%0NvVb7C}yg?U2Iz44vuiy6kCM-Q1u(10| zNABkKH?2dF0LBe2<={OV0nXtoYTl@B8CpBEo*4;XBAx;7G9S)p6;kkSi}!&DD}s-P zXz2wjlQ4rjkRN8Q%gja;m{GS-`mE-TNio7piUyAhYy4? zs94VXf(K6LyfojBU;UjaFL?%kUQ*C^QyZqHQJ4Na*Iu3-luO``pQL)~CmBWUr!kk{ zb+QzS-=vcLpQRCg+Cs=aas=bE+J!;bTf!ChSH9^d7Oc96$Z*=L4VwvbYJ7}jK+AkA z$t{+7#EoId!SK*wuG3z;QLNP-jwj&+817uA#z|qLoDcKRyF2SP^D}j!%PQ-x2=4m%B?h26!v2Rj}6vX&*Y+}w$ z)Uf>-IrD4S7elWiGgzAlj)ku*WZGQf#7r+r@4+28{6*nu=DV7%BG1t?mXY_7{8?vou z$QjTAMC4NNxcRjgv}tcZyPl$13jHmU>NI{O;(Z;I%})}%*FDo;C5GYQn78$BXovK7 zUbOZ>!o#^2i`KSvL9}7B?7wpBD`D%S4%M#?R^idGB&+bgnH}KYxqa2P_6@vz`BiM^ z9&YM>7;n?Q*3&pcaKg*X5Q(hHGm8nW53&MJuf?O8W&{`C3a#d_7H@=ys=Z1 z@iflWJ`9E4eL0G~BGE=RwG+6Y0E$qM141xkvR8mC7qAQvD5hSJqXdLNnZ~Y%H}e6? zCnzW1#Pdoaxdcm!Kux4od&qBwa0aV`*t(-~t)}^DJsE6hB6iTc%a=Ym_U=NU7eA5OdN`Aq#RT3)!ByY4W4_`3F zmeVvfw#wnFrTkJU9d^-MuqFWO0q`x-wsKe(fShsUz=9-iJs>&nYs)z(HBF$VuuM^M z4z=YR=J!ca)3LU?kMkR)$UAAB5*V;)gkDb=^vYw_>8Mv8M~!E!vu`8P2x?L0;s#+bea1+N$Z}ruAtmAZLNF1b=7*-nzF97_0)Ci1E}A$ zZUlKzeZao-y;u-ag}DP#reRIX_g6=853>l!X>)g-A#)HEt%rZ5T2^LTdWt5JB(b+NY<3pxfOTTQDUpRsXT(; zZ4$i|S6{>V=$mopEBM_W{O$;T9|(ST2EPvmzq^9phk{=db$<!4Dhvgb`a;GGBb_RF$%AKon=ViHbBDk|p?i`gnugIP2!JYkb zXH@PmA}yg70L!b)mHtdGj<08Ug%aCv_jUIG&#Vt(d)92@tM2;%C8PlKLPC9ZJ-c|2 z=Rx`H$oo9`Ogj5jr1HZ?S{|xdUwanm&6Ucc5IVYo> zQ!>6u^_t+&Dd59nf6mo+o@o5SmrEpKF=JhY<-A& zb7MDh9|>yylKVtZcF=tio2(=k&si0g!r4|l`G?Si&I?up1IK5lD%&w-J6a#ahiT21 zgC#q!nrFo580@3)MD9FvR;>GNSokCumvsEFI;V5FN4P9p ze*ppU^`T7y-|)j`4fr4?b_bZG|JsJSjnDyn^R?k(+X~p-@I}zmxi-iqs^$@MVj_sY z9;?*h<9AIwCl8O|RuGRr)>+9|2MpL7u{Jwnzliu?7Zzix`w}DG`3!V1#K81;{RQ*L z$&<$)J9_xoiQ}gZA3b^E$gyKb3064kn6p@&+}7qfSU|%@V;_kgt=5}1A9?KLVTRws zB0-y@4zhnF+Qb~4FKeXbre3TPHgin0$uZqyW8E-6YED$l9-ZX^W-fRz!)k&qXFs+e zdg~}A$G0pb*td8yKXkKz4`PFHC1KdtLE^VNVo#&O(j*wahn4&i68 z|9)qj(Q(kaE7+;S&ji~kV8>m6j0((-BHbN0Gq53MF10!zt=@y!6Bb*QR;j+qRjQN=vS1|*Pp&|etOb-xM%IM1SWtC54v7{ zVnT3yf@?##*()rRW$gc9U-CBVNLft-U<&m$jB6T8Xke$8{ttWpI!6=>7E1Z+&pmbZ z%7ybY`2Oj%SL}d5e+D^NIiN#@xEoAs>7SPgFdm-{oIHQSJMzwtV<0qr=nwjG9Rqto zZ

Jebn}*dLf4vcN@+&Lo}ckP*#!g`AwP(VZeYM(?{6EO4?lALY!HFUDFIwy zY}Oq!_v@*0{Gd5zH*h)Gu;``ZwOYpY4SVj{N&}Z@)|k4jb6((wlE#arfoBmg1Fsx{ zrQvPx(9ha7JwcPVpX4Woz1J?`oB{+fjVw+V5q|}y6U1!KX{0--cVMFB5Nxi^#Sz-_ zv5QC$8R)XL%3WO{L&QR4THE!A(Ipa<*CB(j%yMqAh{f{hU+mo`$PE05dDD|R7#w!^ zS#AS?2#R@e3SxDfe9V?x-T*ThgBVV0AX`9*!ub3Nx}SY16`R;90?5v{-SYY-CW0L( z5O}?S@bPdziZ+j=Hf7LC8@Lv2p*rxHTPKFhdogUkOtPV!8%WasW z5k{j)gdIhAN`4W|p~r-F2K?+rH;+H%XVH1$ee<_oL2y1KON%D-OX8e`EaB&`iO_Ie zLJs|YeRWy!D?&N?%5>-Zh1Pj_Uf-2!;ep4#-(n8hg%`^{;J5sokeCQGVhmS*!oHRu zjmLQ=#_MsOJ;jGI!5C?y32hmrevM6KiT;{k$!ko0g~@R=4L%%*1^C_rx<=NxG5I=^UuW_gOnfH4 z$>g_~MC(CHxkfWwhb_xON_k|2Z*$b|v-D$Rj0br3Q%uBCO^Z$`2hN1P;HDVk%oD+c`;^ZAWn z-WIA*v@R-I#_;7RL^9zM_4FMsA`VeP68rTY{gYfIBi|s#lSlouuEOurrIDv-=2f(! z32FFA_@7-rACL={DT1+Y;oW{Fv^I)oN6|g07jf*s8DbY_7ZJ{_gQkS`gcNC+lWm`6 zS6l5)`2zD=_|M>bO+YlGiUCriC_7QFs%oK_oDK4Y*)z*cYqf4a!7gy9<0L;4dnm$` zcs$A7Gz%C}{thMZH5B|~N*o*kt{`S2p3M}J@^5RQP}p9$Uie7tP+>>mslv6wzQWzY UZd_l>ZqH?~=g~)EV>#ph15L*k761SM diff --git a/scholar.py b/scholar.py index a7bd414..f4f98a4 100755 --- a/scholar.py +++ b/scholar.py @@ -239,7 +239,7 @@ class ScholarConf(object): """Helper class for global settings.""" VERSION = '2.10' - LOG_LEVEL = 3 + LOG_LEVEL = 2 MAX_PAGE_RESULTS = 10 # Current default for per-page results SCHOLAR_SITE = 'http://scholar.google.com' @@ -336,6 +336,9 @@ def as_txt(self): res.append(fmt % (item[1], item[0])) return '\n'.join(res) + def as_json(self): + return { key: self.attrs[key][0] for key in self.attrs.keys() } + def as_csv(self, header=False, sep='|'): # Get keys sorted in specified order: keys = [pair[0] for pair in \ @@ -1132,6 +1135,9 @@ def txt(querier, with_globals): for art in articles: print(encode(art.as_txt()) + '\n') +def json(querier): + return [art.as_json() for art in querier.articles] + def csv(querier, header=False, sep='|'): articles = querier.articles for art in articles: From c3f182c7103da4158a04ab95f102226f2d386d5b Mon Sep 17 00:00:00 2001 From: machaerus Date: Sun, 14 Jan 2018 14:39:42 +0100 Subject: [PATCH 3/3] authors now included in the results --- scholar.py | 2077 +++++++++++++++++++++++++--------------------------- 1 file changed, 1014 insertions(+), 1063 deletions(-) diff --git a/scholar.py b/scholar.py index f4f98a4..646d491 100755 --- a/scholar.py +++ b/scholar.py @@ -168,991 +168,942 @@ import warnings try: - # Try importing for Python 3 - # pylint: disable-msg=F0401 - # pylint: disable-msg=E0611 - from urllib.request import HTTPCookieProcessor, Request, build_opener - from urllib.parse import quote, unquote - from http.cookiejar import MozillaCookieJar + # Try importing for Python 3 + # pylint: disable-msg=F0401 + # pylint: disable-msg=E0611 + from urllib.request import HTTPCookieProcessor, Request, build_opener + from urllib.parse import quote, unquote + from http.cookiejar import MozillaCookieJar except ImportError: - # Fallback for Python 2 - from urllib2 import Request, build_opener, HTTPCookieProcessor - from urllib import quote, unquote - from cookielib import MozillaCookieJar + # Fallback for Python 2 + from urllib2 import Request, build_opener, HTTPCookieProcessor + from urllib import quote, unquote + from cookielib import MozillaCookieJar # Import BeautifulSoup -- try 4 first, fall back to older try: - from bs4 import BeautifulSoup + from bs4 import BeautifulSoup except ImportError: - try: - from BeautifulSoup import BeautifulSoup - except ImportError: - print('We need BeautifulSoup, sorry...') - sys.exit(1) + try: + from BeautifulSoup import BeautifulSoup + except ImportError: + print('We need BeautifulSoup, sorry...') + sys.exit(1) # Support unicode in both Python 2 and 3. In Python 3, unicode is str. if sys.version_info[0] == 3: - unicode = str # pylint: disable-msg=W0622 - encode = lambda s: unicode(s) # pylint: disable-msg=C0103 + unicode = str # pylint: disable-msg=W0622 + encode = lambda s: unicode(s) # pylint: disable-msg=C0103 else: - def encode(s): - if isinstance(s, basestring): - return s.encode('utf-8') # pylint: disable-msg=C0103 - else: - return str(s) + def encode(s): + if isinstance(s, basestring): + return s.encode('utf-8') # pylint: disable-msg=C0103 + else: + return str(s) class Error(Exception): - """Base class for any Scholar error.""" + """Base class for any Scholar error.""" class FormatError(Error): - """A query argument or setting was formatted incorrectly.""" + """A query argument or setting was formatted incorrectly.""" class QueryArgumentError(Error): - """A query did not have a suitable set of arguments.""" + """A query did not have a suitable set of arguments.""" class SoupKitchen(object): - """Factory for creating BeautifulSoup instances.""" - - @staticmethod - def make_soup(markup, parser=None): - """Factory method returning a BeautifulSoup instance. The created - instance will use a parser of the given name, if supported by - the underlying BeautifulSoup instance. - """ - if 'bs4' in sys.modules: - # We support parser specification. If the caller didn't - # specify one, leave it to BeautifulSoup to pick the most - # suitable one, but suppress the user warning that asks to - # select the most suitable parser ... which BS then - # selects anyway. - if parser is None: - warnings.filterwarnings('ignore', 'No parser was explicitly specified') - return BeautifulSoup(markup, parser) - - return BeautifulSoup(markup) + """Factory for creating BeautifulSoup instances.""" + + @staticmethod + def make_soup(markup, parser=None): + """Factory method returning a BeautifulSoup instance. The created + instance will use a parser of the given name, if supported by + the underlying BeautifulSoup instance. + """ + if 'bs4' in sys.modules: + # We support parser specification. If the caller didn't + # specify one, leave it to BeautifulSoup to pick the most + # suitable one, but suppress the user warning that asks to + # select the most suitable parser ... which BS then + # selects anyway. + if parser is None: + warnings.filterwarnings('ignore', 'No parser was explicitly specified') + return BeautifulSoup(markup, parser) + + return BeautifulSoup(markup) class ScholarConf(object): - """Helper class for global settings.""" + """Helper class for global settings.""" - VERSION = '2.10' - LOG_LEVEL = 2 - MAX_PAGE_RESULTS = 10 # Current default for per-page results - SCHOLAR_SITE = 'http://scholar.google.com' + VERSION = '2.10' + LOG_LEVEL = 2 + MAX_PAGE_RESULTS = 10 # Current default for per-page results + SCHOLAR_SITE = 'http://scholar.google.com' - # USER_AGENT = 'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.2.9) Gecko/20100913 Firefox/3.6.9' - # Let's update at this point (3/14): - USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0' + # USER_AGENT = 'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.2.9) Gecko/20100913 Firefox/3.6.9' + # Let's update at this point (3/14): + USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0' - # If set, we will use this file to read/save cookies to enable - # cookie use across sessions. - COOKIE_JAR_FILE = None + # If set, we will use this file to read/save cookies to enable + # cookie use across sessions. + COOKIE_JAR_FILE = None class ScholarUtils(object): - """A wrapper for various utensils that come in handy.""" - - LOG_LEVELS = {'error': 1, - 'warn': 2, - 'info': 3, - 'debug': 4} - - @staticmethod - def ensure_int(arg, msg=None): - try: - return int(arg) - except ValueError: - raise FormatError(msg) - - @staticmethod - def log(level, msg): - if level not in ScholarUtils.LOG_LEVELS.keys(): - return - if ScholarUtils.LOG_LEVELS[level] > ScholarConf.LOG_LEVEL: - return - sys.stderr.write('[%5s] %s' % (level.upper(), msg + '\n')) - sys.stderr.flush() + """A wrapper for various utensils that come in handy.""" + + LOG_LEVELS = {'error': 1, + 'warn': 2, + 'info': 3, + 'debug': 4} + + @staticmethod + def ensure_int(arg, msg=None): + try: + return int(arg) + except ValueError: + raise FormatError(msg) + + @staticmethod + def log(level, msg): + if level not in ScholarUtils.LOG_LEVELS.keys(): + return + if ScholarUtils.LOG_LEVELS[level] > ScholarConf.LOG_LEVEL: + return + sys.stderr.write('[%5s] %s' % (level.upper(), msg + '\n')) + sys.stderr.flush() class ScholarArticle(object): - """ - A class representing articles listed on Google Scholar. The class - provides basic dictionary-like behavior. - """ - def __init__(self): - # The triplets for each keyword correspond to (1) the actual - # value, (2) a user-suitable label for the item, and (3) an - # ordering index: - self.attrs = { - 'title': [None, 'Title', 0], - 'url': [None, 'URL', 1], - 'year': [None, 'Year', 2], - 'num_citations': [0, 'Citations', 3], - 'num_versions': [0, 'Versions', 4], - 'cluster_id': [None, 'Cluster ID', 5], - 'url_pdf': [None, 'PDF link', 6], - 'url_citations': [None, 'Citations list', 7], - 'url_versions': [None, 'Versions list', 8], - 'url_citation': [None, 'Citation link', 9], - 'excerpt': [None, 'Excerpt', 10], - } - - # The citation data in one of the standard export formats, - # e.g. BibTeX. - self.citation_data = None - - def __getitem__(self, key): - if key in self.attrs: - return self.attrs[key][0] - return None - - def __len__(self): - return len(self.attrs) - - def __setitem__(self, key, item): - if key in self.attrs: - self.attrs[key][0] = item - else: - self.attrs[key] = [item, key, len(self.attrs)] - - def __delitem__(self, key): - if key in self.attrs: - del self.attrs[key] - - def set_citation_data(self, citation_data): - self.citation_data = citation_data - - def as_txt(self): - # Get items sorted in specified order: - items = sorted(list(self.attrs.values()), key=lambda item: item[2]) - # Find largest label length: - max_label_len = max([len(str(item[1])) for item in items]) - fmt = '%%%ds %%s' % max_label_len - res = [] - for item in items: - if item[0] is not None: - res.append(fmt % (item[1], item[0])) - return '\n'.join(res) - - def as_json(self): - return { key: self.attrs[key][0] for key in self.attrs.keys() } - - def as_csv(self, header=False, sep='|'): - # Get keys sorted in specified order: - keys = [pair[0] for pair in \ - sorted([(key, val[2]) for key, val in list(self.attrs.items())], - key=lambda pair: pair[1])] - res = [] - if header: - res.append(sep.join(keys)) - res.append(sep.join([unicode(self.attrs[key][0]) for key in keys])) - return '\n'.join(res) - - def as_citation(self): - """ - Reports the article in a standard citation format. This works only - if you have configured the querier to retrieve a particular - citation export format. (See ScholarSettings.) - """ - return self.citation_data.decode("utf-8", "strict") or '' + """ + A class representing articles listed on Google Scholar. The class + provides basic dictionary-like behavior. + """ + def __init__(self): + # The triplets for each keyword correspond to (1) the actual + # value, (2) a user-suitable label for the item, and (3) an + # ordering index: + self.attrs = { + 'authors': [None, 'Authors', 0], + 'title': [None, 'Title', 1], + 'url': [None, 'URL', 2], + 'year': [None, 'Year', 3], + 'num_citations': [0, 'Citations', 4], + 'num_versions': [0, 'Versions', 5], + 'cluster_id': [None, 'Cluster ID', 6], + 'url_pdf': [None, 'PDF link', 7], + 'url_citations': [None, 'Citations list', 8], + 'url_versions': [None, 'Versions list', 9], + 'url_citation': [None, 'Citation link', 10], + 'excerpt': [None, 'Excerpt', 11] + } + + # The citation data in one of the standard export formats, + # e.g. BibTeX. + self.citation_data = None + + def __getitem__(self, key): + if key in self.attrs: + return self.attrs[key][0] + return None + + def __len__(self): + return len(self.attrs) + + def __setitem__(self, key, item): + if key in self.attrs: + self.attrs[key][0] = item + else: + self.attrs[key] = [item, key, len(self.attrs)] + + def __delitem__(self, key): + if key in self.attrs: + del self.attrs[key] + + def set_citation_data(self, citation_data): + self.citation_data = citation_data + + def as_txt(self): + # Get items sorted in specified order: + items = sorted(list(self.attrs.values()), key=lambda item: item[2]) + # Find largest label length: + max_label_len = max([len(str(item[1])) for item in items]) + fmt = '%%%ds %%s' % max_label_len + res = [] + for item in items: + if item[0] is not None: + res.append(fmt % (item[1], item[0])) + return '\n'.join(res) + + def as_json(self): + return { key: self.attrs[key][0] for key in self.attrs.keys() } + + def as_csv(self, header=False, sep='|'): + # Get keys sorted in specified order: + keys = [pair[0] for pair in \ + sorted([(key, val[2]) for key, val in list(self.attrs.items())], + key=lambda pair: pair[1])] + res = [] + if header: + res.append(sep.join(keys)) + res.append(sep.join([unicode(self.attrs[key][0]) for key in keys])) + return '\n'.join(res) + + def as_citation(self): + """ + Reports the article in a standard citation format. This works only + if you have configured the querier to retrieve a particular + citation export format. (See ScholarSettings.) + """ + return self.citation_data.decode("utf-8", "strict") or '' class ScholarArticleParser(object): - """ - ScholarArticleParser can parse HTML document strings obtained from - Google Scholar. This is a base class; concrete implementations - adapting to tweaks made by Google over time follow below. - """ - def __init__(self, site=None): - self.soup = None - self.article = None - self.site = site or ScholarConf.SCHOLAR_SITE - self.year_re = re.compile(r'\b(?:20|19)\d{2}\b') - - def handle_article(self, art): - """ - The parser invokes this callback on each article parsed - successfully. In this base class, the callback does nothing. - """ - - def handle_num_results(self, num_results): - """ - The parser invokes this callback if it determines the overall - number of results, as reported on the parsed results page. The - base class implementation does nothing. - """ - - def parse(self, html): - """ - This method initiates parsing of HTML content, cleans resulting - content as needed, and notifies the parser instance of - resulting instances via the handle_article callback. - """ - self.soup = SoupKitchen.make_soup(html) - - # This parses any global, non-itemized attributes from the page. - self._parse_globals() - - # Now parse out listed articles: - for div in self.soup.findAll(ScholarArticleParser._tag_results_checker): - self._parse_article(div) - self._clean_article() - if self.article['title']: - self.handle_article(self.article) - - def _clean_article(self): - """ - This gets invoked after we have parsed an article, to do any - needed cleanup/polishing before we hand off the resulting - article. - """ - if self.article['title']: - self.article['title'] = self.article['title'].strip() - - def _parse_globals(self): - tag = self.soup.find(name='div', attrs={'id': 'gs_ab_md'}) - if tag is not None: - raw_text = tag.findAll(text=True) - # raw text is a list because the body contains etc - if raw_text is not None and len(raw_text) > 0: - try: - num_results = raw_text[0].split()[1] - # num_results may now contain commas to separate - # thousands, strip: - num_results = num_results.replace(',', '') - num_results = int(num_results) - self.handle_num_results(num_results) - except (IndexError, ValueError): - pass - - def _parse_article(self, div): - self.article = ScholarArticle() - - for tag in div: - if not hasattr(tag, 'name'): - continue - - if tag.name == 'div' and self._tag_has_class(tag, 'gs_rt') and \ - tag.h3 and tag.h3.a: - self.article['title'] = ''.join(tag.h3.a.findAll(text=True)) - self.article['url'] = self._path2url(tag.h3.a['href']) - if self.article['url'].endswith('.pdf'): - self.article['url_pdf'] = self.article['url'] - - if tag.name == 'font': - for tag2 in tag: - if not hasattr(tag2, 'name'): - continue - if tag2.name == 'span' and \ - self._tag_has_class(tag2, 'gs_fl'): - self._parse_links(tag2) - - def _parse_links(self, span): - for tag in span: - if not hasattr(tag, 'name'): - continue - if tag.name != 'a' or tag.get('href') is None: - continue - - if tag.get('href').startswith('/scholar?cites'): - if hasattr(tag, 'string') and tag.string.startswith('Cited by'): - self.article['num_citations'] = \ - self._as_int(tag.string.split()[-1]) - - # Weird Google Scholar behavior here: if the original - # search query came with a number-of-results limit, - # then this limit gets propagated to the URLs embedded - # in the results page as well. Same applies to - # versions URL in next if-block. - self.article['url_citations'] = \ - self._strip_url_arg('num', self._path2url(tag.get('href'))) - - # We can also extract the cluster ID from the versions - # URL. Note that we know that the string contains "?", - # from the above if-statement. - args = self.article['url_citations'].split('?', 1)[1] - for arg in args.split('&'): - if arg.startswith('cites='): - self.article['cluster_id'] = arg[6:] - - if tag.get('href').startswith('/scholar?cluster'): - if hasattr(tag, 'string') and tag.string.startswith('All '): - self.article['num_versions'] = \ - self._as_int(tag.string.split()[1]) - self.article['url_versions'] = \ - self._strip_url_arg('num', self._path2url(tag.get('href'))) - - if tag.getText().startswith('Import'): - self.article['url_citation'] = self._path2url(tag.get('href')) - - - @staticmethod - def _tag_has_class(tag, klass): - """ - This predicate function checks whether a BeatifulSoup Tag instance - has a class attribute. - """ - res = tag.get('class') or [] - if type(res) != list: - # BeautifulSoup 3 can return e.g. 'gs_md_wp gs_ttss', - # so split -- conveniently produces a list in any case - res = res.split() - return klass in res - - @staticmethod - def _tag_results_checker(tag): - return tag.name == 'div' \ - and ScholarArticleParser._tag_has_class(tag, 'gs_r') - - @staticmethod - def _as_int(obj): - try: - return int(obj) - except ValueError: - return None - - def _path2url(self, path): - """Helper, returns full URL in case path isn't one.""" - if path.startswith('http://') or path.startswith('https://'): - return path - if not path.startswith('/'): - path = '/' + path - return self.site + path - - def _strip_url_arg(self, arg, url): - """Helper, removes a URL-encoded argument, if present.""" - parts = url.split('?', 1) - if len(parts) != 2: - return url - res = [] - for part in parts[1].split('&'): - if not part.startswith(arg + '='): - res.append(part) - return parts[0] + '?' + '&'.join(res) - - -class ScholarArticleParser120201(ScholarArticleParser): - """ - This class reflects update to the Scholar results page layout that - Google recently. - """ - def _parse_article(self, div): - self.article = ScholarArticle() - - for tag in div: - if not hasattr(tag, 'name'): - continue - - if tag.name == 'h3' and self._tag_has_class(tag, 'gs_rt') and tag.a: - self.article['title'] = ''.join(tag.a.findAll(text=True)) - self.article['url'] = self._path2url(tag.a['href']) - if self.article['url'].endswith('.pdf'): - self.article['url_pdf'] = self.article['url'] - - if tag.name == 'div' and self._tag_has_class(tag, 'gs_a'): - year = self.year_re.findall(tag.text) - self.article['year'] = year[0] if len(year) > 0 else None - - if tag.name == 'div' and self._tag_has_class(tag, 'gs_fl'): - self._parse_links(tag) - - -class ScholarArticleParser120726(ScholarArticleParser): - """ - This class reflects update to the Scholar results page layout that - Google made 07/26/12. - """ - def _parse_article(self, div): - self.article = ScholarArticle() - - for tag in div: - if not hasattr(tag, 'name'): - continue - if str(tag).lower().find('.pdf'): - if tag.find('div', {'class': 'gs_ttss'}): - self._parse_links(tag.find('div', {'class': 'gs_ttss'})) - - if tag.name == 'div' and self._tag_has_class(tag, 'gs_ri'): - # There are (at least) two formats here. In the first - # one, we have a link, e.g.: - # - #

- # - # Honeycomb: creating intrusion detection signatures using - # honeypots - # - #

- # - # In the other, there's no actual link -- it's what - # Scholar renders as "CITATION" in the HTML: - # - #

- # - # [CITATION] - # [C] - # - # Honeycomb automated ids signature creation using honeypots - #

- # - # We now distinguish the two. - try: - atag = tag.h3.a - self.article['title'] = ''.join(atag.findAll(text=True)) - self.article['url'] = self._path2url(atag['href']) - if self.article['url'].endswith('.pdf'): - self.article['url_pdf'] = self.article['url'] - except: - # Remove a few spans that have unneeded content (e.g. [CITATION]) - for span in tag.h3.findAll(name='span'): - span.clear() - self.article['title'] = ''.join(tag.h3.findAll(text=True)) - - if tag.find('div', {'class': 'gs_a'}): - year = self.year_re.findall(tag.find('div', {'class': 'gs_a'}).text) - self.article['year'] = year[0] if len(year) > 0 else None - - if tag.find('div', {'class': 'gs_fl'}): - self._parse_links(tag.find('div', {'class': 'gs_fl'})) - - if tag.find('div', {'class': 'gs_rs'}): - # These are the content excerpts rendered into the results. - raw_text = tag.find('div', {'class': 'gs_rs'}).findAll(text=True) - if len(raw_text) > 0: - raw_text = ''.join(raw_text) - raw_text = raw_text.replace('\n', '') - self.article['excerpt'] = raw_text - + """ + ScholarArticleParser can parse HTML document strings obtained from + Google Scholar. This is a base class; concrete implementations + adapting to tweaks made by Google over time follow below. + """ + def __init__(self, site=None): + self.soup = None + self.article = None + self.site = site or ScholarConf.SCHOLAR_SITE + self.year_re = re.compile(r'\b(?:20|19)\d{2}\b') + + def handle_article(self, art): + """ + The parser invokes this callback on each article parsed + successfully. In this base class, the callback does nothing. + """ + + def handle_num_results(self, num_results): + """ + The parser invokes this callback if it determines the overall + number of results, as reported on the parsed results page. The + base class implementation does nothing. + """ + + def parse(self, html): + """ + This method initiates parsing of HTML content, cleans resulting + content as needed, and notifies the parser instance of + resulting instances via the handle_article callback. + """ + self.soup = SoupKitchen.make_soup(html) + + # This parses any global, non-itemized attributes from the page. + self._parse_globals() + + # Now parse out listed articles: + for div in self.soup.findAll(ScholarArticleParser._tag_results_checker): + self._parse_article(div) + self._clean_article() + if self.article['title']: + self.handle_article(self.article) + + def _clean_article(self): + """ + This gets invoked after we have parsed an article, to do any + needed cleanup/polishing before we hand off the resulting + article. + """ + if self.article['title']: + self.article['title'] = self.article['title'].strip() + + def _parse_globals(self): + tag = self.soup.find(name='div', attrs={'id': 'gs_ab_md'}) + if tag is not None: + raw_text = tag.findAll(text=True) + # raw text is a list because the body contains etc + if raw_text is not None and len(raw_text) > 0: + try: + num_results = raw_text[0].split()[1] + # num_results may now contain commas to separate + # thousands, strip: + num_results = num_results.replace(',', '') + num_results = int(num_results) + self.handle_num_results(num_results) + except (IndexError, ValueError): + pass + + def _parse_article(self, div): + self.article = ScholarArticle() + + for tag in div: + if not hasattr(tag, 'name'): + continue + if str(tag).lower().find('.pdf'): + if tag.find('div', {'class': 'gs_ttss'}): + self._parse_links(tag.find('div', {'class': 'gs_ttss'})) + + if tag.name == 'div' and self._tag_has_class(tag, 'gs_ri'): + # There are (at least) two formats here. In the first + # one, we have a link, e.g.: + # + #

+ # + # Honeycomb: creating intrusion detection signatures using + # honeypots + # + #

+ # + # In the other, there's no actual link -- it's what + # Scholar renders as "CITATION" in the HTML: + # + #

+ # + # [CITATION] + # [C] + # + # Honeycomb automated ids signature creation using honeypots + #

+ # + # We now distinguish the two. + try: + atag = tag.h3.a + self.article['title'] = ''.join(atag.findAll(text=True)) + self.article['url'] = self._path2url(atag['href']) + if self.article['url'].endswith('.pdf'): + self.article['url_pdf'] = self.article['url'] + except: + # Remove a few spans that have unneeded content (e.g. [CITATION]) + for span in tag.h3.findAll(name='span'): + span.clear() + self.article['title'] = ''.join(tag.h3.findAll(text=True)) + + if tag.find('div', {'class': 'gs_a'}): + year = self.year_re.findall(tag.find('div', {'class': 'gs_a'}).text) + self.article['year'] = year[0] if len(year) > 0 else None + + authors = tag.find('div', class_='gs_a') + authors = authors.get_text() + authors = re.sub(r"-.*", "", authors) + self.article['authors'] = [e.strip() for e in authors.split(", ")] + + if tag.find('div', {'class': 'gs_fl'}): + self._parse_links(tag.find('div', {'class': 'gs_fl'})) + + if tag.find('div', {'class': 'gs_rs'}): + # These are the content excerpts rendered into the results. + raw_text = tag.find('div', {'class': 'gs_rs'}).findAll(text=True) + if len(raw_text) > 0: + raw_text = ''.join(raw_text) + raw_text = raw_text.replace('\n', '') + self.article['excerpt'] = raw_text + + def _parse_links(self, span): + for tag in span: + if not hasattr(tag, 'name'): + continue + if tag.name != 'a' or tag.get('href') is None: + continue + + if tag.get('href').startswith('/scholar?cites'): + if hasattr(tag, 'string') and tag.string.startswith('Cited by'): + self.article['num_citations'] = \ + self._as_int(tag.string.split()[-1]) + + # Weird Google Scholar behavior here: if the original + # search query came with a number-of-results limit, + # then this limit gets propagated to the URLs embedded + # in the results page as well. Same applies to + # versions URL in next if-block. + self.article['url_citations'] = \ + self._strip_url_arg('num', self._path2url(tag.get('href'))) + + # We can also extract the cluster ID from the versions + # URL. Note that we know that the string contains "?", + # from the above if-statement. + args = self.article['url_citations'].split('?', 1)[1] + for arg in args.split('&'): + if arg.startswith('cites='): + self.article['cluster_id'] = arg[6:] + + if tag.get('href').startswith('/scholar?cluster'): + if hasattr(tag, 'string') and tag.string.startswith('All '): + self.article['num_versions'] = \ + self._as_int(tag.string.split()[1]) + self.article['url_versions'] = \ + self._strip_url_arg('num', self._path2url(tag.get('href'))) + + if tag.getText().startswith('Import'): + self.article['url_citation'] = self._path2url(tag.get('href')) + + + @staticmethod + def _tag_has_class(tag, klass): + """ + This predicate function checks whether a BeatifulSoup Tag instance + has a class attribute. + """ + res = tag.get('class') or [] + if type(res) != list: + # BeautifulSoup 3 can return e.g. 'gs_md_wp gs_ttss', + # so split -- conveniently produces a list in any case + res = res.split() + return klass in res + + @staticmethod + def _tag_results_checker(tag): + return tag.name == 'div' \ + and ScholarArticleParser._tag_has_class(tag, 'gs_r') + + @staticmethod + def _as_int(obj): + try: + return int(obj) + except ValueError: + return None + + def _path2url(self, path): + """Helper, returns full URL in case path isn't one.""" + if path.startswith('http://') or path.startswith('https://'): + return path + if not path.startswith('/'): + path = '/' + path + return self.site + path + + def _strip_url_arg(self, arg, url): + """Helper, removes a URL-encoded argument, if present.""" + parts = url.split('?', 1) + if len(parts) != 2: + return url + res = [] + for part in parts[1].split('&'): + if not part.startswith(arg + '='): + res.append(part) + return parts[0] + '?' + '&'.join(res) class ScholarQuery(object): - """ - The base class for any kind of results query we send to Scholar. - """ - def __init__(self): - self.url = None - - # The number of results requested from Scholar -- not the - # total number of results it reports (the latter gets stored - # in attrs, see below). - self.num_results = None - - # Queries may have global result attributes, similar to - # per-article attributes in ScholarArticle. The exact set of - # attributes may differ by query type, but they all share the - # basic data structure: - self.attrs = {} - - def set_num_page_results(self, num_page_results): - self.num_results = ScholarUtils.ensure_int( - num_page_results, - 'maximum number of results on page must be numeric') - - def get_url(self): - """ - Returns a complete, submittable URL string for this particular - query instance. The URL and its arguments will vary depending - on the query. - """ - return None - - def _add_attribute_type(self, key, label, default_value=None): - """ - Adds a new type of attribute to the list of attributes - understood by this query. Meant to be used by the constructors - in derived classes. - """ - if len(self.attrs) == 0: - self.attrs[key] = [default_value, label, 0] - return - idx = max([item[2] for item in self.attrs.values()]) + 1 - self.attrs[key] = [default_value, label, idx] - - def __getitem__(self, key): - """Getter for attribute value. Returns None if no such key.""" - if key in self.attrs: - return self.attrs[key][0] - return None - - def __setitem__(self, key, item): - """Setter for attribute value. Does nothing if no such key.""" - if key in self.attrs: - self.attrs[key][0] = item - - def _parenthesize_phrases(self, query): - """ - Turns a query string containing comma-separated phrases into a - space-separated list of tokens, quoted if containing - whitespace. For example, input - - 'some words, foo, bar' - - becomes - - '"some words" foo bar' - - This comes in handy during the composition of certain queries. - """ - if query.find(',') < 0: - return query - phrases = [] - for phrase in query.split(','): - phrase = phrase.strip() - if phrase.find(' ') > 0: - phrase = '"' + phrase + '"' - phrases.append(phrase) - return ' '.join(phrases) + """ + The base class for any kind of results query we send to Scholar. + """ + def __init__(self): + self.url = None + + # The number of results requested from Scholar -- not the + # total number of results it reports (the latter gets stored + # in attrs, see below). + self.num_results = None + + # Queries may have global result attributes, similar to + # per-article attributes in ScholarArticle. The exact set of + # attributes may differ by query type, but they all share the + # basic data structure: + self.attrs = {} + + def set_num_page_results(self, num_page_results): + self.num_results = ScholarUtils.ensure_int( + num_page_results, + 'maximum number of results on page must be numeric') + + def get_url(self): + """ + Returns a complete, submittable URL string for this particular + query instance. The URL and its arguments will vary depending + on the query. + """ + return None + + def _add_attribute_type(self, key, label, default_value=None): + """ + Adds a new type of attribute to the list of attributes + understood by this query. Meant to be used by the constructors + in derived classes. + """ + if len(self.attrs) == 0: + self.attrs[key] = [default_value, label, 0] + return + idx = max([item[2] for item in self.attrs.values()]) + 1 + self.attrs[key] = [default_value, label, idx] + + def __getitem__(self, key): + """Getter for attribute value. Returns None if no such key.""" + if key in self.attrs: + return self.attrs[key][0] + return None + + def __setitem__(self, key, item): + """Setter for attribute value. Does nothing if no such key.""" + if key in self.attrs: + self.attrs[key][0] = item + + def _parenthesize_phrases(self, query): + """ + Turns a query string containing comma-separated phrases into a + space-separated list of tokens, quoted if containing + whitespace. For example, input + + 'some words, foo, bar' + + becomes + + '"some words" foo bar' + + This comes in handy during the composition of certain queries. + """ + if query.find(',') < 0: + return query + phrases = [] + for phrase in query.split(','): + phrase = phrase.strip() + if phrase.find(' ') > 0: + phrase = '"' + phrase + '"' + phrases.append(phrase) + return ' '.join(phrases) class ClusterScholarQuery(ScholarQuery): - """ - This version just pulls up an article cluster whose ID we already - know about. - """ - SCHOLAR_CLUSTER_URL = ScholarConf.SCHOLAR_SITE + '/scholar?' \ - + 'cluster=%(cluster)s' \ - + '%(num)s' + """ + This version just pulls up an article cluster whose ID we already + know about. + """ + SCHOLAR_CLUSTER_URL = ScholarConf.SCHOLAR_SITE + '/scholar?' \ + + 'cluster=%(cluster)s' \ + + '%(num)s' - def __init__(self, cluster=None): - ScholarQuery.__init__(self) - self._add_attribute_type('num_results', 'Results', 0) - self.cluster = None - self.set_cluster(cluster) + def __init__(self, cluster=None): + ScholarQuery.__init__(self) + self._add_attribute_type('num_results', 'Results', 0) + self.cluster = None + self.set_cluster(cluster) - def set_cluster(self, cluster): - """ - Sets search to a Google Scholar results cluster ID. - """ - msg = 'cluster ID must be numeric' - self.cluster = ScholarUtils.ensure_int(cluster, msg) + def set_cluster(self, cluster): + """ + Sets search to a Google Scholar results cluster ID. + """ + msg = 'cluster ID must be numeric' + self.cluster = ScholarUtils.ensure_int(cluster, msg) - def get_url(self): - if self.cluster is None: - raise QueryArgumentError('cluster query needs cluster ID') + def get_url(self): + if self.cluster is None: + raise QueryArgumentError('cluster query needs cluster ID') - urlargs = {'cluster': self.cluster } + urlargs = {'cluster': self.cluster } - for key, val in urlargs.items(): - urlargs[key] = quote(encode(val)) + for key, val in urlargs.items(): + urlargs[key] = quote(encode(val)) - # The following URL arguments must not be quoted, or the - # server will not recognize them: - urlargs['num'] = ('&num=%d' % self.num_results - if self.num_results is not None else '') + # The following URL arguments must not be quoted, or the + # server will not recognize them: + urlargs['num'] = ('&num=%d' % self.num_results + if self.num_results is not None else '') - return self.SCHOLAR_CLUSTER_URL % urlargs + return self.SCHOLAR_CLUSTER_URL % urlargs class SearchScholarQuery(ScholarQuery): - """ - This version represents the search query parameters the user can - configure on the Scholar website, in the advanced search options. - """ - SCHOLAR_QUERY_URL = ScholarConf.SCHOLAR_SITE + '/scholar?' \ - + 'as_q=%(words)s' \ - + '&as_epq=%(phrase)s' \ - + '&as_oq=%(words_some)s' \ - + '&as_eq=%(words_none)s' \ - + '&as_occt=%(scope)s' \ - + '&as_sauthors=%(authors)s' \ - + '&as_publication=%(pub)s' \ - + '&as_ylo=%(ylo)s' \ - + '&as_yhi=%(yhi)s' \ - + '&as_vis=%(citations)s' \ - + '&btnG=&hl=en' \ - + '%(num)s' \ - + '&as_sdt=%(patents)s%%2C5' - - def __init__(self): - ScholarQuery.__init__(self) - self._add_attribute_type('num_results', 'Results', 0) - self.words = None # The default search behavior - self.words_some = None # At least one of those words - self.words_none = None # None of these words - self.phrase = None - self.scope_title = False # If True, search in title only - self.author = None - self.pub = None - self.timeframe = [None, None] - self.include_patents = True - self.include_citations = True - - def set_words(self, words): - """Sets words that *all* must be found in the result.""" - self.words = words - - def set_words_some(self, words): - """Sets words of which *at least one* must be found in result.""" - self.words_some = words - - def set_words_none(self, words): - """Sets words of which *none* must be found in the result.""" - self.words_none = words - - def set_phrase(self, phrase): - """Sets phrase that must be found in the result exactly.""" - self.phrase = phrase - - def set_scope(self, title_only): - """ - Sets Boolean indicating whether to search entire article or title - only. - """ - self.scope_title = title_only - - def set_author(self, author): - """Sets names that must be on the result's author list.""" - self.author = author - - def set_pub(self, pub): - """Sets the publication in which the result must be found.""" - self.pub = pub - - def set_timeframe(self, start=None, end=None): - """ - Sets timeframe (in years as integer) in which result must have - appeared. It's fine to specify just start or end, or both. - """ - if start: - start = ScholarUtils.ensure_int(start) - if end: - end = ScholarUtils.ensure_int(end) - self.timeframe = [start, end] - - def set_include_citations(self, yesorno): - self.include_citations = yesorno - - def set_include_patents(self, yesorno): - self.include_patents = yesorno - - def get_url(self): - if self.words is None and self.words_some is None \ - and self.words_none is None and self.phrase is None \ - and self.author is None and self.pub is None \ - and self.timeframe[0] is None and self.timeframe[1] is None: - raise QueryArgumentError('search query needs more parameters') - - # If we have some-words or none-words lists, we need to - # process them so GS understands them. For simple - # space-separeted word lists, there's nothing to do. For lists - # of phrases we have to ensure quotations around the phrases, - # separating them by whitespace. - words_some = None - words_none = None - - if self.words_some: - words_some = self._parenthesize_phrases(self.words_some) - if self.words_none: - words_none = self._parenthesize_phrases(self.words_none) - - urlargs = {'words': self.words or '', - 'words_some': words_some or '', - 'words_none': words_none or '', - 'phrase': self.phrase or '', - 'scope': 'title' if self.scope_title else 'any', - 'authors': self.author or '', - 'pub': self.pub or '', - 'ylo': self.timeframe[0] or '', - 'yhi': self.timeframe[1] or '', - 'patents': '0' if self.include_patents else '1', - 'citations': '0' if self.include_citations else '1'} - - for key, val in urlargs.items(): - urlargs[key] = quote(encode(val)) - - # The following URL arguments must not be quoted, or the - # server will not recognize them: - urlargs['num'] = ('&num=%d' % self.num_results - if self.num_results is not None else '') - - return self.SCHOLAR_QUERY_URL % urlargs + """ + This version represents the search query parameters the user can + configure on the Scholar website, in the advanced search options. + """ + SCHOLAR_QUERY_URL = ScholarConf.SCHOLAR_SITE + '/scholar?' \ + + 'as_q=%(words)s' \ + + '&as_epq=%(phrase)s' \ + + '&as_oq=%(words_some)s' \ + + '&as_eq=%(words_none)s' \ + + '&as_occt=%(scope)s' \ + + '&as_sauthors=%(authors)s' \ + + '&as_publication=%(pub)s' \ + + '&as_ylo=%(ylo)s' \ + + '&as_yhi=%(yhi)s' \ + + '&as_vis=%(citations)s' \ + + '&btnG=&hl=en' \ + + '%(num)s' \ + + '&as_sdt=%(patents)s%%2C5' + + def __init__(self): + ScholarQuery.__init__(self) + self._add_attribute_type('num_results', 'Results', 0) + self.words = None # The default search behavior + self.words_some = None # At least one of those words + self.words_none = None # None of these words + self.phrase = None + self.scope_title = False # If True, search in title only + self.author = None + self.pub = None + self.timeframe = [None, None] + self.include_patents = True + self.include_citations = True + + def set_words(self, words): + """Sets words that *all* must be found in the result.""" + self.words = words + + def set_words_some(self, words): + """Sets words of which *at least one* must be found in result.""" + self.words_some = words + + def set_words_none(self, words): + """Sets words of which *none* must be found in the result.""" + self.words_none = words + + def set_phrase(self, phrase): + """Sets phrase that must be found in the result exactly.""" + self.phrase = phrase + + def set_scope(self, title_only): + """ + Sets Boolean indicating whether to search entire article or title + only. + """ + self.scope_title = title_only + + def set_author(self, author): + """Sets names that must be on the result's author list.""" + self.author = author + + def set_pub(self, pub): + """Sets the publication in which the result must be found.""" + self.pub = pub + + def set_timeframe(self, start=None, end=None): + """ + Sets timeframe (in years as integer) in which result must have + appeared. It's fine to specify just start or end, or both. + """ + if start: + start = ScholarUtils.ensure_int(start) + if end: + end = ScholarUtils.ensure_int(end) + self.timeframe = [start, end] + + def set_include_citations(self, yesorno): + self.include_citations = yesorno + + def set_include_patents(self, yesorno): + self.include_patents = yesorno + + def get_url(self): + if self.words is None and self.words_some is None \ + and self.words_none is None and self.phrase is None \ + and self.author is None and self.pub is None \ + and self.timeframe[0] is None and self.timeframe[1] is None: + raise QueryArgumentError('search query needs more parameters') + + # If we have some-words or none-words lists, we need to + # process them so GS understands them. For simple + # space-separeted word lists, there's nothing to do. For lists + # of phrases we have to ensure quotations around the phrases, + # separating them by whitespace. + words_some = None + words_none = None + + if self.words_some: + words_some = self._parenthesize_phrases(self.words_some) + if self.words_none: + words_none = self._parenthesize_phrases(self.words_none) + + urlargs = {'words': self.words or '', + 'words_some': words_some or '', + 'words_none': words_none or '', + 'phrase': self.phrase or '', + 'scope': 'title' if self.scope_title else 'any', + 'authors': self.author or '', + 'pub': self.pub or '', + 'ylo': self.timeframe[0] or '', + 'yhi': self.timeframe[1] or '', + 'patents': '0' if self.include_patents else '1', + 'citations': '0' if self.include_citations else '1'} + + for key, val in urlargs.items(): + urlargs[key] = quote(encode(val)) + + # The following URL arguments must not be quoted, or the + # server will not recognize them: + urlargs['num'] = ('&num=%d' % self.num_results + if self.num_results is not None else '') + + return self.SCHOLAR_QUERY_URL % urlargs class ScholarSettings(object): - """ - This class lets you adjust the Scholar settings for your - session. It's intended to mirror the features tunable in the - Scholar Settings pane, but right now it's a bit basic. - """ - CITFORM_NONE = 0 - CITFORM_REFWORKS = 1 - CITFORM_REFMAN = 2 - CITFORM_ENDNOTE = 3 - CITFORM_BIBTEX = 4 - - def __init__(self): - self.citform = 0 # Citation format, default none - self.per_page_results = None - self._is_configured = False - - def set_citation_format(self, citform): - citform = ScholarUtils.ensure_int(citform) - if citform < 0 or citform > self.CITFORM_BIBTEX: - raise FormatError('citation format invalid, is "%s"' - % citform) - self.citform = citform - self._is_configured = True - - def set_per_page_results(self, per_page_results): - self.per_page_results = ScholarUtils.ensure_int( - per_page_results, 'page results must be integer') - self.per_page_results = min( - self.per_page_results, ScholarConf.MAX_PAGE_RESULTS) - self._is_configured = True - - def is_configured(self): - return self._is_configured + """ + This class lets you adjust the Scholar settings for your + session. It's intended to mirror the features tunable in the + Scholar Settings pane, but right now it's a bit basic. + """ + CITFORM_NONE = 0 + CITFORM_REFWORKS = 1 + CITFORM_REFMAN = 2 + CITFORM_ENDNOTE = 3 + CITFORM_BIBTEX = 4 + + def __init__(self): + self.citform = 0 # Citation format, default none + self.per_page_results = None + self._is_configured = False + + def set_citation_format(self, citform): + citform = ScholarUtils.ensure_int(citform) + if citform < 0 or citform > self.CITFORM_BIBTEX: + raise FormatError('citation format invalid, is "%s"' + % citform) + self.citform = citform + self._is_configured = True + + def set_per_page_results(self, per_page_results): + self.per_page_results = ScholarUtils.ensure_int( + per_page_results, 'page results must be integer') + self.per_page_results = min( + self.per_page_results, ScholarConf.MAX_PAGE_RESULTS) + self._is_configured = True + + def is_configured(self): + return self._is_configured class ScholarQuerier(object): - """ - ScholarQuerier instances can conduct a search on Google Scholar - with subsequent parsing of the resulting HTML content. The - articles found are collected in the articles member, a list of - ScholarArticle instances. - """ - - # Default URLs for visiting and submitting Settings pane, as of 3/14 - GET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_settings?' \ - + 'sciifh=1&hl=en&as_sdt=0,5' - - SET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_setprefs?' \ - + 'q=' \ - + '&scisig=%(scisig)s' \ - + '&inststart=0' \ - + '&as_sdt=1,5' \ - + '&as_sdtp=' \ - + '&num=%(num)s' \ - + '&scis=%(scis)s' \ - + '%(scisf)s' \ - + '&hl=en&lang=all&instq=&inst=569367360547434339&save=' - - # Older URLs: - # ScholarConf.SCHOLAR_SITE + '/scholar?q=%s&hl=en&btnG=Search&as_sdt=2001&as_sdtp=on - - class Parser(ScholarArticleParser120726): - def __init__(self, querier): - ScholarArticleParser120726.__init__(self) - self.querier = querier - - def handle_num_results(self, num_results): - if self.querier is not None and self.querier.query is not None: - self.querier.query['num_results'] = num_results - - def handle_article(self, art): - self.querier.add_article(art) - - def __init__(self): - self.articles = [] - self.query = None - self.cjar = MozillaCookieJar() - - # If we have a cookie file, load it: - if ScholarConf.COOKIE_JAR_FILE and \ - os.path.exists(ScholarConf.COOKIE_JAR_FILE): - try: - self.cjar.load(ScholarConf.COOKIE_JAR_FILE, - ignore_discard=True) - ScholarUtils.log('info', 'loaded cookies file') - except Exception as msg: - ScholarUtils.log('warn', 'could not load cookies file: %s' % msg) - self.cjar = MozillaCookieJar() # Just to be safe - - self.opener = build_opener(HTTPCookieProcessor(self.cjar)) - self.settings = None # Last settings object, if any - - def apply_settings(self, settings): - """ - Applies settings as provided by a ScholarSettings instance. - """ - if settings is None or not settings.is_configured(): - return True - - self.settings = settings - - # This is a bit of work. We need to actually retrieve the - # contents of the Settings pane HTML in order to extract - # hidden fields before we can compose the query for updating - # the settings. - html = self._get_http_response(url=self.GET_SETTINGS_URL, - log_msg='dump of settings form HTML', - err_msg='requesting settings failed') - if html is None: - return False - - # Now parse the required stuff out of the form. We require the - # "scisig" token to make the upload of our settings acceptable - # to Google. - soup = SoupKitchen.make_soup(html) - - tag = soup.find(name='form', attrs={'id': 'gs_bdy_frm'}) - if tag is None: - ScholarUtils.log('info', 'parsing settings failed: no form') - return False - - tag = tag.find('input', attrs={'type':'hidden', 'name':'scisig'}) - if tag is None: - ScholarUtils.log('info', 'parsing settings failed: scisig') - return False - - urlargs = {'scisig': tag['value'], - 'num': settings.per_page_results, - 'scis': 'no', - 'scisf': ''} - - if settings.citform != 0: - urlargs['scis'] = 'yes' - urlargs['scisf'] = '&scisf=%d' % settings.citform - - html = self._get_http_response(url=self.SET_SETTINGS_URL % urlargs, - log_msg='dump of settings result HTML', - err_msg='applying setttings failed') - if html is None: - return False - - ScholarUtils.log('info', 'settings applied') - return True - - def send_query(self, query): - """ - This method initiates a search query (a ScholarQuery instance) - with subsequent parsing of the response. - """ - self.clear_articles() - self.query = query - - html = self._get_http_response(url=query.get_url(), - log_msg='dump of query response HTML', - err_msg='results retrieval failed') - if html is None: - return - - self.parse(html) - - def get_citation_data(self, article): - """ - Given an article, retrieves citation link. Note, this requires that - you adjusted the settings to tell Google Scholar to actually - provide this information, *prior* to retrieving the article. - """ - if article['url_citation'] is None: - return False - if article.citation_data is not None: - return True - - ScholarUtils.log('info', 'retrieving citation export data') - data = self._get_http_response(url=article['url_citation'], - log_msg='citation data response', - err_msg='requesting citation data failed') - if data is None: - return False - - article.set_citation_data(data) - return True - - def parse(self, html): - """ - This method allows parsing of provided HTML content. - """ - parser = self.Parser(self) - parser.parse(html) - - def add_article(self, art): - self.get_citation_data(art) - self.articles.append(art) - - def clear_articles(self): - """Clears any existing articles stored from previous queries.""" - self.articles = [] - - def save_cookies(self): - """ - This stores the latest cookies we're using to disk, for reuse in a - later session. - """ - if ScholarConf.COOKIE_JAR_FILE is None: - return False - try: - self.cjar.save(ScholarConf.COOKIE_JAR_FILE, - ignore_discard=True) - ScholarUtils.log('info', 'saved cookies file') - return True - except Exception as msg: - ScholarUtils.log('warn', 'could not save cookies file: %s' % msg) - return False - - def _get_http_response(self, url, log_msg=None, err_msg=None): - """ - Helper method, sends HTTP request and returns response payload. - """ - if log_msg is None: - log_msg = 'HTTP response data follow' - if err_msg is None: - err_msg = 'request failed' - try: - ScholarUtils.log('info', 'requesting %s' % unquote(url)) - - req = Request(url=url, headers={'User-Agent': ScholarConf.USER_AGENT}) - hdl = self.opener.open(req) - html = hdl.read() - - ScholarUtils.log('debug', log_msg) - ScholarUtils.log('debug', '>>>>' + '-'*68) - ScholarUtils.log('debug', 'url: %s' % hdl.geturl()) - ScholarUtils.log('debug', 'result: %s' % hdl.getcode()) - ScholarUtils.log('debug', 'headers:\n' + str(hdl.info())) - ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8')) # For Python 3 - ScholarUtils.log('debug', '<<<<' + '-'*68) - - return html - except Exception as err: - ScholarUtils.log('info', err_msg + ': %s' % err) - return None + """ + ScholarQuerier instances can conduct a search on Google Scholar + with subsequent parsing of the resulting HTML content. The + articles found are collected in the articles member, a list of + ScholarArticle instances. + """ + + # Default URLs for visiting and submitting Settings pane, as of 3/14 + GET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_settings?' \ + + 'sciifh=1&hl=en&as_sdt=0,5' + + SET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_setprefs?' \ + + 'q=' \ + + '&scisig=%(scisig)s' \ + + '&inststart=0' \ + + '&as_sdt=1,5' \ + + '&as_sdtp=' \ + + '&num=%(num)s' \ + + '&scis=%(scis)s' \ + + '%(scisf)s' \ + + '&hl=en&lang=all&instq=&inst=569367360547434339&save=' + + # Older URLs: + # ScholarConf.SCHOLAR_SITE + '/scholar?q=%s&hl=en&btnG=Search&as_sdt=2001&as_sdtp=on + + class Parser(ScholarArticleParser): + def __init__(self, querier): + ScholarArticleParser.__init__(self) + self.querier = querier + + def handle_num_results(self, num_results): + if self.querier is not None and self.querier.query is not None: + self.querier.query['num_results'] = num_results + + def handle_article(self, art): + self.querier.add_article(art) + + def __init__(self): + self.articles = [] + self.query = None + self.cjar = MozillaCookieJar() + + # If we have a cookie file, load it: + if ScholarConf.COOKIE_JAR_FILE and \ + os.path.exists(ScholarConf.COOKIE_JAR_FILE): + try: + self.cjar.load(ScholarConf.COOKIE_JAR_FILE, + ignore_discard=True) + ScholarUtils.log('info', 'loaded cookies file') + except Exception as msg: + ScholarUtils.log('warn', 'could not load cookies file: %s' % msg) + self.cjar = MozillaCookieJar() # Just to be safe + + self.opener = build_opener(HTTPCookieProcessor(self.cjar)) + self.settings = None # Last settings object, if any + + def apply_settings(self, settings): + """ + Applies settings as provided by a ScholarSettings instance. + """ + if settings is None or not settings.is_configured(): + return True + + self.settings = settings + + # This is a bit of work. We need to actually retrieve the + # contents of the Settings pane HTML in order to extract + # hidden fields before we can compose the query for updating + # the settings. + html = self._get_http_response(url=self.GET_SETTINGS_URL, + log_msg='dump of settings form HTML', + err_msg='requesting settings failed') + if html is None: + return False + + # Now parse the required stuff out of the form. We require the + # "scisig" token to make the upload of our settings acceptable + # to Google. + soup = SoupKitchen.make_soup(html) + + tag = soup.find(name='form', attrs={'id': 'gs_bdy_frm'}) + if tag is None: + ScholarUtils.log('info', 'parsing settings failed: no form') + return False + + tag = tag.find('input', attrs={'type':'hidden', 'name':'scisig'}) + if tag is None: + ScholarUtils.log('info', 'parsing settings failed: scisig') + return False + + urlargs = {'scisig': tag['value'], + 'num': settings.per_page_results, + 'scis': 'no', + 'scisf': ''} + + if settings.citform != 0: + urlargs['scis'] = 'yes' + urlargs['scisf'] = '&scisf=%d' % settings.citform + + html = self._get_http_response(url=self.SET_SETTINGS_URL % urlargs, + log_msg='dump of settings result HTML', + err_msg='applying setttings failed') + if html is None: + return False + + ScholarUtils.log('info', 'settings applied') + return True + + def send_query(self, query): + """ + This method initiates a search query (a ScholarQuery instance) + with subsequent parsing of the response. + """ + self.clear_articles() + self.query = query + + html = self._get_http_response(url=query.get_url(), + log_msg='dump of query response HTML', + err_msg='results retrieval failed') + if html is None: + return + + self.parse(html) + + def get_citation_data(self, article): + """ + Given an article, retrieves citation link. Note, this requires that + you adjusted the settings to tell Google Scholar to actually + provide this information, *prior* to retrieving the article. + """ + if article['url_citation'] is None: + return False + if article.citation_data is not None: + return True + + ScholarUtils.log('info', 'retrieving citation export data') + data = self._get_http_response(url=article['url_citation'], + log_msg='citation data response', + err_msg='requesting citation data failed') + if data is None: + return False + + article.set_citation_data(data) + return True + + def parse(self, html): + """ + This method allows parsing of provided HTML content. + """ + parser = self.Parser(self) + parser.parse(html) + + def add_article(self, art): + self.get_citation_data(art) + self.articles.append(art) + + def clear_articles(self): + """Clears any existing articles stored from previous queries.""" + self.articles = [] + + def save_cookies(self): + """ + This stores the latest cookies we're using to disk, for reuse in a + later session. + """ + if ScholarConf.COOKIE_JAR_FILE is None: + return False + try: + self.cjar.save(ScholarConf.COOKIE_JAR_FILE, + ignore_discard=True) + ScholarUtils.log('info', 'saved cookies file') + return True + except Exception as msg: + ScholarUtils.log('warn', 'could not save cookies file: %s' % msg) + return False + + def _get_http_response(self, url, log_msg=None, err_msg=None): + """ + Helper method, sends HTTP request and returns response payload. + """ + if log_msg is None: + log_msg = 'HTTP response data follow' + if err_msg is None: + err_msg = 'request failed' + try: + ScholarUtils.log('info', 'requesting %s' % unquote(url)) + + req = Request(url=url, headers={'User-Agent': ScholarConf.USER_AGENT}) + hdl = self.opener.open(req) + html = hdl.read() + + ScholarUtils.log('debug', log_msg) + ScholarUtils.log('debug', '>>>>' + '-'*68) + ScholarUtils.log('debug', 'url: %s' % hdl.geturl()) + ScholarUtils.log('debug', 'result: %s' % hdl.getcode()) + ScholarUtils.log('debug', 'headers:\n' + str(hdl.info())) + ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8')) # For Python 3 + ScholarUtils.log('debug', '<<<<' + '-'*68) + + return html + except Exception as err: + ScholarUtils.log('info', err_msg + ': %s' % err) + return None def txt(querier, with_globals): - if with_globals: - # If we have any articles, check their attribute labels to get - # the maximum length -- makes for nicer alignment. - max_label_len = 0 - if len(querier.articles) > 0: - items = sorted(list(querier.articles[0].attrs.values()), - key=lambda item: item[2]) - max_label_len = max([len(str(item[1])) for item in items]) - - # Get items sorted in specified order: - items = sorted(list(querier.query.attrs.values()), key=lambda item: item[2]) - # Find largest label length: - max_label_len = max([len(str(item[1])) for item in items] + [max_label_len]) - fmt = '[G] %%%ds %%s' % max(0, max_label_len-4) - for item in items: - if item[0] is not None: - print(fmt % (item[1], item[0])) - if len(items) > 0: - print - - articles = querier.articles - for art in articles: - print(encode(art.as_txt()) + '\n') + if with_globals: + # If we have any articles, check their attribute labels to get + # the maximum length -- makes for nicer alignment. + max_label_len = 0 + if len(querier.articles) > 0: + items = sorted(list(querier.articles[0].attrs.values()), + key=lambda item: item[2]) + max_label_len = max([len(str(item[1])) for item in items]) + + # Get items sorted in specified order: + items = sorted(list(querier.query.attrs.values()), key=lambda item: item[2]) + # Find largest label length: + max_label_len = max([len(str(item[1])) for item in items] + [max_label_len]) + fmt = '[G] %%%ds %%s' % max(0, max_label_len-4) + for item in items: + if item[0] is not None: + print(fmt % (item[1], item[0])) + if len(items) > 0: + print + + articles = querier.articles + for art in articles: + print(encode(art.as_txt()) + '\n') def json(querier): - return [art.as_json() for art in querier.articles] + return [art.as_json() for art in querier.articles] def csv(querier, header=False, sep='|'): - articles = querier.articles - for art in articles: - result = art.as_csv(header=header, sep=sep) - print(encode(result)) - header = False + articles = querier.articles + for art in articles: + result = art.as_csv(header=header, sep=sep) + print(encode(result)) + header = False def citation_export(querier): - articles = querier.articles - for art in articles: - print(art.as_citation() + '\n') + articles = querier.articles + for art in articles: + print(art.as_citation() + '\n') def main(): - usage = """scholar.py [options] + usage = """scholar.py [options] A command-line interface to Google Scholar. Examples: @@ -1167,150 +1118,150 @@ def main(): # does not contain the words "quantum" and "theory": scholar.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970""" - fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100) - parser = optparse.OptionParser(usage=usage, formatter=fmt) - group = optparse.OptionGroup(parser, 'Query arguments', - 'These options define search query arguments and parameters.') - group.add_option('-a', '--author', metavar='AUTHORS', default=None, - help='Author name(s)') - group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw', - help='Results must contain all of these words') - group.add_option('-s', '--some', metavar='WORDS', default=None, - help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases') - group.add_option('-n', '--none', metavar='WORDS', default=None, - help='Results must contain none of these words. See -s|--some re. formatting') - group.add_option('-p', '--phrase', metavar='PHRASE', default=None, - help='Results must contain exact phrase') - group.add_option('-t', '--title-only', action='store_true', default=False, - help='Search title only') - group.add_option('-P', '--pub', metavar='PUBLICATIONS', default=None, - help='Results must have appeared in this publication') - group.add_option('--after', metavar='YEAR', default=None, - help='Results must have appeared in or after given year') - group.add_option('--before', metavar='YEAR', default=None, - help='Results must have appeared in or before given year') - group.add_option('--no-patents', action='store_true', default=False, - help='Do not include patents in results') - group.add_option('--no-citations', action='store_true', default=False, - help='Do not include citations in results') - group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None, - help='Do not search, just use articles in given cluster ID') - group.add_option('-c', '--count', type='int', default=None, - help='Maximum number of results') - parser.add_option_group(group) - - group = optparse.OptionGroup(parser, 'Output format', - 'These options control the appearance of the results.') - group.add_option('--txt', action='store_true', - help='Print article data in text format (default)') - group.add_option('--txt-globals', action='store_true', - help='Like --txt, but first print global results too') - group.add_option('--csv', action='store_true', - help='Print article data in CSV form (separator is "|")') - group.add_option('--csv-header', action='store_true', - help='Like --csv, but print header with column names') - group.add_option('--citation', metavar='FORMAT', default=None, - help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).') - parser.add_option_group(group) - - group = optparse.OptionGroup(parser, 'Miscellaneous') - group.add_option('--cookie-file', metavar='FILE', default=None, - help='File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.') - group.add_option('-d', '--debug', action='count', default=0, - help='Enable verbose logging to stderr. Repeated options increase detail of debug output.') - group.add_option('-v', '--version', action='store_true', default=False, - help='Show version information') - parser.add_option_group(group) - - options, _ = parser.parse_args() - - # Show help if we have neither keyword search nor author name - if len(sys.argv) == 1: - parser.print_help() - return 1 - - if options.debug > 0: - options.debug = min(options.debug, ScholarUtils.LOG_LEVELS['debug']) - ScholarConf.LOG_LEVEL = options.debug - ScholarUtils.log('info', 'using log level %d' % ScholarConf.LOG_LEVEL) - - if options.version: - print('This is scholar.py %s.' % ScholarConf.VERSION) - return 0 - - if options.cookie_file: - ScholarConf.COOKIE_JAR_FILE = options.cookie_file - - # Sanity-check the options: if they include a cluster ID query, it - # makes no sense to have search arguments: - if options.cluster_id is not None: - if options.author or options.allw or options.some or options.none \ - or options.phrase or options.title_only or options.pub \ - or options.after or options.before: - print('Cluster ID queries do not allow additional search arguments.') - return 1 - - querier = ScholarQuerier() - settings = ScholarSettings() - - if options.citation == 'bt': - settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX) - elif options.citation == 'en': - settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE) - elif options.citation == 'rm': - settings.set_citation_format(ScholarSettings.CITFORM_REFMAN) - elif options.citation == 'rw': - settings.set_citation_format(ScholarSettings.CITFORM_REFWORKS) - elif options.citation is not None: - print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".') - return 1 - - querier.apply_settings(settings) - - if options.cluster_id: - query = ClusterScholarQuery(cluster=options.cluster_id) - else: - query = SearchScholarQuery() - if options.author: - query.set_author(options.author) - if options.allw: - query.set_words(options.allw) - if options.some: - query.set_words_some(options.some) - if options.none: - query.set_words_none(options.none) - if options.phrase: - query.set_phrase(options.phrase) - if options.title_only: - query.set_scope(True) - if options.pub: - query.set_pub(options.pub) - if options.after or options.before: - query.set_timeframe(options.after, options.before) - if options.no_patents: - query.set_include_patents(False) - if options.no_citations: - query.set_include_citations(False) - - if options.count is not None: - options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS) - query.set_num_page_results(options.count) - - querier.send_query(query) - - if options.csv: - csv(querier) - elif options.csv_header: - csv(querier, header=True) - elif options.citation is not None: - citation_export(querier) - else: - txt(querier, with_globals=options.txt_globals) - - if options.cookie_file: - querier.save_cookies() - - return 0 + fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100) + parser = optparse.OptionParser(usage=usage, formatter=fmt) + group = optparse.OptionGroup(parser, 'Query arguments', + 'These options define search query arguments and parameters.') + group.add_option('-a', '--author', metavar='AUTHORS', default=None, + help='Author name(s)') + group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw', + help='Results must contain all of these words') + group.add_option('-s', '--some', metavar='WORDS', default=None, + help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases') + group.add_option('-n', '--none', metavar='WORDS', default=None, + help='Results must contain none of these words. See -s|--some re. formatting') + group.add_option('-p', '--phrase', metavar='PHRASE', default=None, + help='Results must contain exact phrase') + group.add_option('-t', '--title-only', action='store_true', default=False, + help='Search title only') + group.add_option('-P', '--pub', metavar='PUBLICATIONS', default=None, + help='Results must have appeared in this publication') + group.add_option('--after', metavar='YEAR', default=None, + help='Results must have appeared in or after given year') + group.add_option('--before', metavar='YEAR', default=None, + help='Results must have appeared in or before given year') + group.add_option('--no-patents', action='store_true', default=False, + help='Do not include patents in results') + group.add_option('--no-citations', action='store_true', default=False, + help='Do not include citations in results') + group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None, + help='Do not search, just use articles in given cluster ID') + group.add_option('-c', '--count', type='int', default=None, + help='Maximum number of results') + parser.add_option_group(group) + + group = optparse.OptionGroup(parser, 'Output format', + 'These options control the appearance of the results.') + group.add_option('--txt', action='store_true', + help='Print article data in text format (default)') + group.add_option('--txt-globals', action='store_true', + help='Like --txt, but first print global results too') + group.add_option('--csv', action='store_true', + help='Print article data in CSV form (separator is "|")') + group.add_option('--csv-header', action='store_true', + help='Like --csv, but print header with column names') + group.add_option('--citation', metavar='FORMAT', default=None, + help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).') + parser.add_option_group(group) + + group = optparse.OptionGroup(parser, 'Miscellaneous') + group.add_option('--cookie-file', metavar='FILE', default=None, + help='File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.') + group.add_option('-d', '--debug', action='count', default=0, + help='Enable verbose logging to stderr. Repeated options increase detail of debug output.') + group.add_option('-v', '--version', action='store_true', default=False, + help='Show version information') + parser.add_option_group(group) + + options, _ = parser.parse_args() + + # Show help if we have neither keyword search nor author name + if len(sys.argv) == 1: + parser.print_help() + return 1 + + if options.debug > 0: + options.debug = min(options.debug, ScholarUtils.LOG_LEVELS['debug']) + ScholarConf.LOG_LEVEL = options.debug + ScholarUtils.log('info', 'using log level %d' % ScholarConf.LOG_LEVEL) + + if options.version: + print('This is scholar.py %s.' % ScholarConf.VERSION) + return 0 + + if options.cookie_file: + ScholarConf.COOKIE_JAR_FILE = options.cookie_file + + # Sanity-check the options: if they include a cluster ID query, it + # makes no sense to have search arguments: + if options.cluster_id is not None: + if options.author or options.allw or options.some or options.none \ + or options.phrase or options.title_only or options.pub \ + or options.after or options.before: + print('Cluster ID queries do not allow additional search arguments.') + return 1 + + querier = ScholarQuerier() + settings = ScholarSettings() + + if options.citation == 'bt': + settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX) + elif options.citation == 'en': + settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE) + elif options.citation == 'rm': + settings.set_citation_format(ScholarSettings.CITFORM_REFMAN) + elif options.citation == 'rw': + settings.set_citation_format(ScholarSettings.CITFORM_REFWORKS) + elif options.citation is not None: + print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".') + return 1 + + querier.apply_settings(settings) + + if options.cluster_id: + query = ClusterScholarQuery(cluster=options.cluster_id) + else: + query = SearchScholarQuery() + if options.author: + query.set_author(options.author) + if options.allw: + query.set_words(options.allw) + if options.some: + query.set_words_some(options.some) + if options.none: + query.set_words_none(options.none) + if options.phrase: + query.set_phrase(options.phrase) + if options.title_only: + query.set_scope(True) + if options.pub: + query.set_pub(options.pub) + if options.after or options.before: + query.set_timeframe(options.after, options.before) + if options.no_patents: + query.set_include_patents(False) + if options.no_citations: + query.set_include_citations(False) + + if options.count is not None: + options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS) + query.set_num_page_results(options.count) + + querier.send_query(query) + + if options.csv: + csv(querier) + elif options.csv_header: + csv(querier, header=True) + elif options.citation is not None: + citation_export(querier) + else: + txt(querier, with_globals=options.txt_globals) + + if options.cookie_file: + querier.save_cookies() + + return 0 if __name__ == "__main__": - sys.exit(main()) + sys.exit(main())