From 76844eec78a23f482a4e0dfe9684898a6ef35fb2 Mon Sep 17 00:00:00 2001 From: offline893 <158537145+offline893@users.noreply.github.com> Date: Wed, 17 Sep 2025 10:36:43 +0800 Subject: [PATCH] Dynamic Expert Load Balance with Zero-like-overhead (#2956) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Motivation Currently dynamically experts balancing would stop-the-world. Asynchronously expert load balancing would be better without flowing problems: Host-bound latency: There are many cpu operations during EPLB such as eplb-algorithm、creating p2p ops、and log2phy expert converting would spend long cpu time, as ~1s. Communication latency: The transfer time would cost much in the situation without nvlink. As the weight of an expert maybe transfer to multiple new positions, thus N times send/recv for one expert, with result long latency. We had tested that batch_isend_irecv cost more 100ms for 16 experts weight transmission in A2 server of ascend. SwiftBalancer would not stop-the-world anymore, in out test on NPU 1~2ms cost for each layer while benefit 5ms-8ms decode latency with ep_size = 64. The following updates have been made: 1、expert distribution recording with lower cost. 2、async cpu computing for eplb algo and other python operator. 3、new eplb algo with less expert rebalancing while almost the same effect. ### Proposed Change We will gradually migrate the EPLB logic to the VLLM community and implement a generalized design. Relevant RFC: https://github.com/vllm-project/vllm/issues/22246 The overall workflow involves: 474430541-23b06f58-23bc-44a3-a1be-00f268aeb15c 1. Record experts distribution during forward. We using expert_token_num after disptach instead of topk_ids, thus we got much smaller tensor shape to reduce cost of hbm recording and add-operator. 2. Do all-gather for experts distribution. Using all-gather instead of all-reduce as less traffic volume. 3. 
Wake up eplb worker process with experts distribution when num_iterations comes. Run eplb algorithm in eplb worker. 4. Generate p2p send/recv ops and other operator such as log2phy would cost long cpu time. 5. Lanch ibatch_send_recv in async_stream before forward. 6. After forward, wait for the ibatch_send_recv finish, then do uapte expert map and expert weights. ### Co-author Co-authored-by: raindaywhu raindaywhu@raindaywhu@ 163.con Co-authored-by: njuyuan yuanjl19@smail.nju.edu.cn Co-authored-by: qmkakaxi wjh1594260677@qq.com Co-authored-by: Skywalker-EP 173723846@qq.com - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/567939953b7a9cb0ded6bf0bb21a76917b8fed97 --------- Signed-off-by: offline0806 Co-authored-by: offline0806 --- .../feature_guide/eplb_swift_balancer.md | 45 + .../feature_guide/images/eplb_img.png | Bin 0 -> 56081 bytes docs/source/user_guide/feature_guide/index.md | 1 + .../test_determin_expert_map_all.py | 0 vllm_ascend/ascend_config.py | 13 +- vllm_ascend/eplb/__init__.py | 0 vllm_ascend/eplb/adaptor/__init__.py | 0 vllm_ascend/eplb/adaptor/abstract_adaptor.py | 44 + vllm_ascend/eplb/adaptor/vllm_adaptor.py | 289 +++++++ vllm_ascend/eplb/core/__init__.py | 0 .../eplb/core/eplb_device_transfer_loader.py | 137 ++++ vllm_ascend/eplb/core/eplb_utils.py | 135 +++ vllm_ascend/eplb/core/eplb_worker.py | 436 ++++++++++ vllm_ascend/eplb/core/policy/__init__.py | 0 .../eplb/core/policy/policy_abstract.py | 42 + .../eplb/core/policy/policy_dynamic_ep.py | 389 +++++++++ .../eplb/core/policy/policy_dynamic_ep_v2.py | 771 ++++++++++++++++++ .../eplb/core/policy/policy_factory.py | 26 + vllm_ascend/eplb/core/policy/policy_random.py | 30 + vllm_ascend/eplb/eplb_updator.py | 205 +++++ vllm_ascend/eplb/utils.py | 77 ++ vllm_ascend/ops/common_fused_moe.py | 64 +- vllm_ascend/ops/fused_moe.py | 85 +- vllm_ascend/ops/moe/moe_comm_method.py | 6 +- vllm_ascend/quantization/w4a8_dynamic.py | 5 +- vllm_ascend/quantization/w8a8_dynamic.py | 
6 +- .../torchair/models/torchair_deepseek_v2.py | 2 + .../torchair/ops/torchair_fused_moe.py | 76 +- vllm_ascend/worker/model_runner_v1.py | 53 +- vllm_ascend/worker/worker_v1.py | 1 + 30 files changed, 2891 insertions(+), 47 deletions(-) create mode 100644 docs/source/user_guide/feature_guide/eplb_swift_balancer.md create mode 100644 docs/source/user_guide/feature_guide/images/eplb_img.png create mode 100644 tests/ut/distributed/test_determin_expert_map_all.py create mode 100644 vllm_ascend/eplb/__init__.py create mode 100644 vllm_ascend/eplb/adaptor/__init__.py create mode 100644 vllm_ascend/eplb/adaptor/abstract_adaptor.py create mode 100644 vllm_ascend/eplb/adaptor/vllm_adaptor.py create mode 100644 vllm_ascend/eplb/core/__init__.py create mode 100644 vllm_ascend/eplb/core/eplb_device_transfer_loader.py create mode 100644 vllm_ascend/eplb/core/eplb_utils.py create mode 100644 vllm_ascend/eplb/core/eplb_worker.py create mode 100644 vllm_ascend/eplb/core/policy/__init__.py create mode 100644 vllm_ascend/eplb/core/policy/policy_abstract.py create mode 100644 vllm_ascend/eplb/core/policy/policy_dynamic_ep.py create mode 100644 vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py create mode 100644 vllm_ascend/eplb/core/policy/policy_factory.py create mode 100644 vllm_ascend/eplb/core/policy/policy_random.py create mode 100644 vllm_ascend/eplb/eplb_updator.py create mode 100644 vllm_ascend/eplb/utils.py diff --git a/docs/source/user_guide/feature_guide/eplb_swift_balancer.md b/docs/source/user_guide/feature_guide/eplb_swift_balancer.md new file mode 100644 index 0000000..a4d1e7a --- /dev/null +++ b/docs/source/user_guide/feature_guide/eplb_swift_balancer.md @@ -0,0 +1,45 @@ +# Swift Balancer + +## Overview +Experts rebalancing of MoE models for LLM serving is a mandatory option. Changing experts dynamically with a stop-the-world approach would have a negative impact on TTFT and TPOT. +Asynchronous expert load balancing would be a better choice. 
+We have launched SwiftBalancer to support dynamic experts load balancing with Zero-overhead experts movement. + +## Design + +![img.png](images/eplb_img.png) + +The overall workflow involves: +1. Record experts distribution during forward. We use expert_token_num after dispatch instead of topk_ids, so the tensor shape is much smaller, which reduces the cost of hbm + recording and add-operator. +2. Do all-gather for experts distribution. All-gather is used instead of all-reduce because it has less traffic volume. +3. Wake up eplb worker process with experts distribution once num_iterations is reached. Run eplb algorithm in eplb worker. +4. Generate p2p send/recv ops and run other operators such as log2phy, which would cost long cpu time. +5. Launch ibatch_send_recv in async_stream before forward. +6. After forward, wait for the ibatch_send_recv to finish, then update the expert map and expert weights. + +Our profiling shows that experts transforming is hidden in the bubble between forward iterations. The cpu time cost of the eplb algo. and other python operators such as log2phy +would be hidden by the eplb worker process too. + +## Config Params + +Currently swift balancer improves TPOT by 5ms with ep size 64, while costing less than 2ms for every layer's expert movement. + +We add new parameters for eplb: +"dynamic_eplb":true -- enable dynamic eplb +"num_iterations_eplb_update": 400 -- forward iterations when eplb would begin +"gate_eplb":true -- eplb would update only once, false by default. +"num_wait_worker_iterations":30 -- forward iterations when eplb worker will finish cpu task. In our test default value 30 would cover most cases. +"expert_map_record_path" -- When dynamic eplb is completed, save the current expert load heatmap to the specified path. +"init_redundancy_expert" -- Specify redundant experts during initialization. + +## Examples +### Dynamic eplb +Enable dynamic eplb and specify the trigger rounds. 
+--additional-config '{ "dynamic_eplb":true,"num_iterations_eplb_update":400, "gate_eplb":true, "num_wait_worker_iterations":30}' +### Record expert map for static eplb +Specify the path for the static eplb initialization file. +--additional-config '{ "expert_map_record_path": "/xx/xx.json", "init_redundancy_expert": 16, dynamic_eplb":true,"num_iterations_eplb_update":400, "gate_eplb":true, "num_wait_worker_iterations":30}' +### Static eplb +If expert map has been recorded, enable static eplb with expert map path. +--additional-config '{ "expert_map_path": "/xx/xx.json"}' diff --git a/docs/source/user_guide/feature_guide/images/eplb_img.png b/docs/source/user_guide/feature_guide/images/eplb_img.png new file mode 100644 index 0000000000000000000000000000000000000000..2888b17f4de5172d38d1a030671d90506837fdae GIT binary patch literal 56081 zcmb6BRa{lw7xxX5(j^Ge-6h?fibzPebayu!5RmRJ1*A(rx~03jyF)sBr(Xa2exH-) z;MpG!bnmtHUNgoRzcI#l8m9O`1`U}2843yt?X9e&G87anITRE$3nC172UqHg1{9P( z)LTh06?eU(OfwCf`NqQ2(aYVX_-j6|5p=iftimY#SJWkR5I7`c3NpUkh^-8#5!n$w zv$tO<$RM%_;H~$o6-^l=)yPJVg!DNUKGQ^8>v0m|iAK%^P7?J;d3*zlPR`uT;oeYHB}OT&==bSUfCe#xVls=8f6$K~ zKP2J5q6?aunUM;6*O!(asUrWIt5&`1SR-auRz^m~!0qndUN;vPGz<(0q`t{O1Pmr7 zCTC~o=9U&OPtP1vD=S7kJas+2m0%RSoaVedDtda0LmihhTG>~am{6nOdtYN?e^gR} z7JtF&si9IXnBiL`Ft} zNgJP>4AjjO@>0{$A!<_({re;j0X|zAR9{T{2L^U_cFN1kXJ%$v-Onk=$VA1&0{HKA z6%gWi98TR(hNKbESY)mWU;N?X`hA*a~qwDMMM~VgysF; z2KBP1ya=8}m{PEL}k=IBgWf_XSPI^yEu`uzDbcyta94#m0o z`696PK<;!CmC>?EWD;t|&N6XBPj*jc=YhVgv6~BH-eZ^#u zxC6T|^x>DnhY#qesO`#qb*-%*rZn~RY|YK-7#Jv8hw7`VCnhIhJ$&UO{xhcjRwt(a zynq4}Kkh%<>FY)K|LhO;G)S}e_bm>5R*=-x)N(NG!v789A;f1_gOHH$`1lwMFz9?} zXo!=W8yyWzRaG@VFAwv^EF?LaRODoDPeV)VYdJ3;pQf_1vZf|dr({CAvU_TJy4+Jk z7g+E1c2Q#D%$OL=uW-gCg@t`HGq}iv1O%bt|5>Hyb+VC-tt|}=&AYd6>FMa|8ybd( zhh1D;!0x37F9wFgwp^yn1PJi+u3r%^QCM4|jJ#uj_~ck-r-R;$3#c z+MAYB|aaYC*ZA;5)=0i4j@`$xxiI6 zH+zF=0It*C{tTGqhYurFM!gi2lwhXFyTDAv(<-!AZEbB4fN`#F1q6|klgD6|S{<_f z&lmjNm?f>P&$s7bw{D&_*VL$}sMLJ_j!@b+I2c~|hM(W__G~?wRsSVx_?IuX%Z;jf 
zdMs2_aLQ7{KPoG83JVL%%4~q6i4G19mh@0pPi|@wpbE+dvjz-1padTuUrZb9Xflr} zs{h9M3yhP2g{9!zH*74du&5|*PEI&K0zyJq>WiBjS_URoR{d7*`K2YIZaNAIaSe^v zxXjGV7Ut%aRaG9Io;1|d@bK`B|+kxDS&!a~F)I^Y~K`p3u5?=E*0_jY!`29Yki z{JsAXSlpX80s;bKlpBN)GG#DhisshV)+Q!gZ{GYh_8EAO(y}r>US2Y7H@BaqrA&;B zC11a845#z>-L8lH{_Q-*1lB`XSeT2ecJjM|ev>O&O~k@~W}(jrf?h&Gf^0$sHVX^O z%iy!1dsQ)*GloK_=FFQfr}y{wudc3&is;Qd z`}^qy1)CchJiyDwrmUodk%3`geja}b7^u0`hyOOS8raOLYHG#B#S=-sP=`2mN&eyi z>NeKaV>semHH?2Rnil~}aj?Jt$;>P}JNvt?|9{azX6`D=VuS zJF79<77hv-@;F3$>g#Va1>AOacX_zEXE3+i{N2JcGl^+wcl-Jfj~5aDYM55n zmDMV+AIy`<`wr(87xfz);s1HF`4FIEnh^i_xN18O(%!h37^vRPrhLj%xc@Hr*&J=O z&Wl?u@ozq2d zK13+JRWZbnf70qmd#ox(_y75owc^Hq9}P_WLlT(v%;>+Kth}tu$eS3t9XemA$L@av z_<2-rv|Lg2RzYrHNVey8f>MZ?jg<%ZnuzPyZLggFx^d&x>ZYjtiV~6|2LT~1YA$8` z&B4eTy6t)cJ}d0=fntIMqer#>GrSZ2q^u}ug&)`!DfS0>%atQTGrDwr06n42h3VL8FV6@_Fr|9%ec?;u|bApZA{yTBZyPe)|`?WQdZ zFy5-b99}i&m=7k?KL3v&1{2qV4%7QjOaAXN(Z6DI9vY}B5G-o)`+QAqh)b}D4+k-t z)qviphMk(8-rU$g9ES9Y+%LW+jRgkP)6)Z@-U56dt?>;pnBi7Py91bm7-BMV^7XZ~ z%I4(hed(-CBE=Js=Ro~RT_&Yg$O&Z$T*qE61uC96@G!qdC zI&Sq{?v7e8`Y@~!`8+?~b#{vXdpHc<8A(hFh7?ZX_r4kKR`Q{5-)$dn+hNEG5JL*C zp8q|?1xr$L^4;yNxVShF!rEF|QN#k*_##iDw?IDt=iXMpcU!4cBZVV;I`#ACnI2_%CGuH?l@jk?=A{lZUdvNA#=GwSqDd~po55%0NZyT|R%LJ!~g!`5*AG--1 zg@=aa%#&1%bI;a(`lG-c{@1#X)Yq6mgapx`JU18mUK|I4jDv$ys4V;TEi{t9_-*Ot z))ollLA#25BO|R`qaQ%Ty}G#}B_dkVs&8y0z{DhDVBi-LLL8xj(5jO*P_Nn$}5SCo>H0+LBvyQ!iA z-jG?+d$Ev@3xQq&S4J70c=^YO3y5*`i&4Ov)0!4Ak0 z4Db>Y6884?Lc_v2NWFB7L6kcSnFTkHgzlq{{RU^2CcSLXW zbub&V__46NCco#vp~ynHzPKpMKRbMD!=JUxR346{^>@pFm|yJi)s1CBDK^f5)!E3E$2d19ehnW1nP+BV>iS%t(=6*^)sFb73BCq_?y1kl?V{|;ccg#& znOY%B5PBU-2`Zr8^Qss`35OFP=LFnNh%(T?ZU!+KBwS#hBD{DZ%Jt>TmmG1BQ6vPy z`P=(T`+GaPxQv^wzEF9%UbG7fJ6OZ*K2wmB8|;s#&CJXU`TQ9b4Gn`t5D|A{dmBC) zk5vy98JU715YoPTT&DdKq`D==#US}?sIQ-znp*SrIhdk%SJu-L=H`BpC>EEKO~x{& ziitzfb$iPT{GBzWtg@jYh05Oyt;`9>peq;X7VnqnJGM*HH$%iD2So#at}24=Kd zh(X676i*RCFpU!QMH|s0Iip&j8hQalW^u_NQ4a5e&mJ!>o1L}S-3k0Sg$pf{OrCU! zlhYWFP&KuUWQW{@HR3Tt6V5-&5&@~VSTs5jZ%jf0+z7PR+U~B5do)Pd{r#abE_Qc! 
zDJdvk00 zL?ff1bQt9b7Y0^Wvx|s`WH=oj9*$=4eXFQYh#(1g6%ZxrgDG~KVp0G_A)dFev=lVs zcfbFgQYOmS#01MooNB;5v9OTtR$os~&(~L&4xJZ2%+ayU`;NWYd+>>-JxKIQG(E?n zGJkE&V2o*_DSl0K)$flG6_q6f9sG1)IhH9>BG7y}N|-czNTFm*)Ik{hOJid|A&)OC zAnOLJu2CPE#V~hr0VQ2jR)&UFR9p=CptP&oK@<2zH00Btr8d7mk;#-I0zhSu@<(yu zd9p6HYlO7@7>TO!oE=fokP8-I{6H!4wyAa|jgxd?aSv)nHKkZ0EB$KGLfHOy!{Vrj z536V9D|YLM7jPc3UJK!@=nt{DogGr;I{Y>+yZmKD??Zlb@>c#B4@0be@8Z$?-6&vh zC}jA`14Gb7kMP`2Rn|!iJ8P2aJ8{Uj_r!tjoAB~!{wkNBth%0Av-w$Dev_*OJbTTB zk7x2yb&papwRaA>aFf4J^&YopN#MkPkM1s>xq!ji(Ow`D7nBQ>PxT-7>mz*kM378p zWDt>(W=BS%sm9qZ)k7zV3L<1fbyZeZ!^rvH2RVUg2~kDsMWD7b23;%zY3yK`7#oA?4Hl}KZofv%UaF#NTsw6X=v0q7g1h`wSJ0WOS8Wvp z!AxVvL%v=!bqeRiJ+ovo`_f!<8nziN;%#O^1!omIdjBMh^0boBpegooIm>K68m|8B z^UN64cGDM>Ps3a@5oXblerAlvk)X0NF;(hnR4i&Sw40$ zHobUQ==kFev53=sWpFN8in&As-|jYt$n)*;&p)?G56CT5+ zCA6<_lRk^%vk{Ep?+G@eI>*0@R-$doGtEL>2};Sarm4l~Htm!%?CQVVszaOo-`2R; z>M#2EUHRijS{j=6q(Ut|a`tI%X69N8cG29z0%ZAH>+8^N(3u+>8~m=p9?Q}~R~)6>(=&Q2o_gka*F8KA8EeIeIk;bW3c zPCptO`89TcYRtW?jI8Vhr94nI0`kTHLEO|{b1tK!Lu_t(t$NCVel7q!pc77ex#*6D zrlwfKn!V^9NUSa{*oolZFz=x{hb@G(w9~qk7OkzTsbdIKdCh+|?oC5+sl>#>Hg$S~ z*Z*alY*d?+RzS@rw<`RqtOM6HWSvm@iA^uuF*f&9%RR+P_r=r0vhpQl)nm6Q8nWiLPd zD)RjPfn;lwKmB#VMa_a^d5GY)l}X=dr>yMBHMOI{jyNe!swhU_Qa;K@=M!^%LK`Qn zrVh`8t=z4qVq^7m^v}ZjjiNGs_iujd>s?F?p(refv0A=||2eaeVJ6f+;7u{4T~{>W zc$UH+hOGEz7O&oM-&wUBOLk?4*V^mM`R8{`(r@Ji7`?$CHz;-`1Wj@l#{-ae zH9v?-;LU&FO$oyku=E(i)4{DfnHz^Cv!@(g&MAu=_mABzuUxO}vOg6h9L|9nB=bS&N zsHg^7pkg&f=l-&uU=MK)RZvS)6Kk@#Js!AEvEfQ$4Ruh^P*A|AyJ-)Am5Cw>pe3`c z?3R6 z+SIpKJkpKi{j1>!EYBS(vl--P4{hx++dNjrH)t%FFFwVxO*VYLw=;Wo!CsI5Tg8aL zQztCHU15j7`13`AeJ~T_g zX+}L>e9UVTG{Irc<3B}5*<@23nkP|^`*z4st!~hj{K8wORI20Sk4-KaEnTs}$7L$9 zN}8sn29|6KKXb1yCu1(YruNcbR!tP&>HD%zrod*sE6MjPvX-Y~+NC>2Q+6QsJ!W;SB$$x|X z1Vj=a1C|Fy2^JodLQ89EI30YP3=GzupME~kG5?lVR-Ra0&6Z4^g=a0gBX^RL`ZY5U zBoh@CCB6VEVFxS}xE*q1VHU_!XwYcz_`!M|wx2$I8V7oO6>{(|5Ciol70q>cbg zTwIW0(I~ZkM%u1~%_SoFq)RvdF&V{ewaVNV+mpe4FZD;$a@hdU055KSQ0RoS&-JCV 
zu~JS1s>ebYS3-KgQl4ZXgn#Jf;q3Aa5$_qS2Hu_4y8GCDsK{Pg&FhDL>=1@(RK4Y2 z&J6C&UAlS~aV@EDEUr#E&1kx~|=n!4R#rQXdsV zIKd;3AU{&TEih8`kT$kr;{3$xC7F`#(cEQIytug)76^QH?qUeWQ}^gF9l0Y3Dif_< zd=2Kcb~0iwvNWj27_UO_TH8}9Lh~MC?b=}ygX#$=kS-LT!XykXFK6D0>w_+iJ5nEl z`8#!Wby-=&dcQo7BpPvwe2oH7{GxgRRIX@*vb!K{Cn+T7Ff5W{VC3dN8<(fV&D(P{0mfEwI>s;@R|7W!cFL{h>+Lb#^iERjo*Mu)Hr9y``5zK_E{r ztQP^1TF2IXb0(9W9>3o^iG7A6(l>)qGIdMpBe(^7F{Y4Q*y|k%lqX>ZM3b88WwzTYjxm7P$~(U8yS7W3f*<7} zu<|^a-O4t;nZ4(%9=DEop5SpT7tZ^y_5}|O9Z=>pdV>@n9}i^5lzDJ#2@O1eN zo>o$>-Pf*F-BTz>BQqN7ue1hVS)sms)HX&fADi7WE?`wm_}V1A!OGk-;&K-eiiv{o zP;~JgHU`!A#HluzrX!1e)%y~Udj~u2E1o2rW_e|{u*2h;&G;o!?z_vD73qapjn~oP zNI$Cea~NgEJ@l;5EGO(o4bSsORgw`>5X>jTJwt$CoW65&#I8wW7|LG$xzpl#0(008 z;=q2Pec%fV27UNQ*wH{5Yx!Uhwh2#D7gLe{Y_s~=&~cP7vY80Gpc=WplNP^Z^^uJi<7m;7)|#z0;u5{GmbcB|_c#+AL=w}J2G$aT`)E;|J`t}ryFwc^ zJgs(v5_uV4!&r@$+lonweXC;H+1eFL_) zo*)8@fxc?evr@qt{pHekEB;NHuX}%ev{15qQphxbo%}Za!;ni)*rl;!)cf-Z-iv!m zGP9OQ^b6%=?@YVsx~2}A?@Ovp*?ux_@7XmacDqE6nu)0UV2#>_!)1E!kpuqRa?27P zVx}KjRT$bNzGO@<8BAxyluI9!BODATl#|0e^TvKw)zO^S(OlM{wk(egY-XVfP>HdR z&v-4?>R3}-3)(8?a&4}-czB?;vntF=GC#0uUmBJ(eBN$=fsX>4+e#yI_PLZdvPO1# zpM-T{U^!X-g8`NO{VEX(6rokK*}5@3%*vRm83Ifd|AxN0Paml`BvNd z*W9zGx+z~b&B-@5z0*0d<{r8`<_`*fB<@Wz$Qgzf3J&I6#%5Nbw-rpNa`=d0FI!n& zWWWeU?CwQBKKMOHS05$L`(}SvRf~N_)^tcS{+WXOnP%X}?+$kg@2z^4Oz0badAHES z{raz*2pVfWl8R=T#C=~|UsU}3c3Z+|g&lDCt-|&M>WS3c^}yR$E)8uy1KGB?G89GW z3zCW~sw%!MHO4|wpDx?$I;xT#?PkG?aJilo&N5-(aY_T zr0uAk{JQUQqv=Un*)f}%o62%s%9oq@-=lHUNm#gNSU9IxrX<>(LL2i7?(@H+$uTRU zF?2ChQk0^E^X0|u?D(q^%=qfce^6oAY{*3!e zc^M=>AX`jbX(Wa<6x6gne*X{|9X-i%yYN;{4s?eOkB;(m{hPx;y$_>NO}1O*%)3n? 
z^m{~6xoAqXj#J{c=ntn&A*V4K%p557q6T5oYR(9ylU%6#YSGkw$=KH$5Q6y^Irg7# zA?qdD8nIS!-z{U!X5y3MX1}WV3p#zed|D~EhjA_FbbH#u?oyrjHtmRA`{kp{laB4| z)+#yU@_RauR|Apcdeo`kmOdB?ADM)sObta%HtU-=^GiBlnLD#JGn__|u}T;P|0==X zNbuNev!P*o_~QO-DM81Ek2+)ii4Nz0J`<-P4!7t|#=HwvCk1vMV;p1CYT7$vTD`Pj)aqFO_`4d~|0p>MF~uggBD1EIK)R zhDnG;zm!>{urH1EL;5gnFRD$R@1 zu-H3>P6G1w0CU?n(k-!br@!DlAc7_?7d$)`$*nSL88@S7-U;gJsxng2cODOX3;Y@> z8oUDh4-a?2!JmU=q=ULYe=d*;dLNT*@x)8w6@1d=^p`v-vAk)if?4tW zsxfTZh*6168fi_+dX;)g^-4%eqTT#l40^w`q@GLaliS`!35F(Kl*Kl^kE~ubEMf|I zITe?ouJlfln@jB-ozgo%lMp#3w0x>q+TdH);CpZJG&R|~DIYD9HHF44d{E*y zrmFt%s3h-n+|rwv&t(FiAFSyyPhUiT?r>f;Ugf~%Kn~rpLt}dHc@G^BK;KXQ_U+r) z*jSJXW@Tl8;v`^6x`m3c;KW=vd+!D8T$Pp55C5n#LOtx+$z@CEOEmibH_M--8y;OGJEqNX7?oFbX{j} zU-{@*ZRb$$1Yuq4U{hynTV-uOW!E-)(=_|oGJW4L{=_(Q2eCw(YCfHIF7?;EsMa@G zr@%KKTh;{lp+9T2-V^9GTApZa+16V!G+EN|l`!%bG4K`-2o(0s(+@5%^v}@^%+bke z55{NDG4%7%4KC2Ha3^DN#b8e0tbfGWO4u^RTQ_KN%{pg5fknXXGNDmBK}m+^E&p<% z6OU70PR3NsLT*%=bC6SXXk?6KbC|LHDf^^&EpLE+WY{ZE{li;fe*VUNi;r&VH(dxA zfJgx~?&YN=P^P?IuZM?_oen?n&V-EgF1SAXUj1s^XXYVYePAQjNwn&AKQZTE+qUuw zuAm>Q66OQ1=;IOB*H z(_fAg;7wufYZGqxj$&-R$mn68*JIs^s*Z2$CbodiC(JNh+O5zv-Z0J;@w?_c#J>+m z9nvYqgJ(H^r;k0I81q9FrdRo9YWkXE41=2{Z-8@TfoVNaWj>D&LdCb$7@J9-eDvN` zlcf%SgnNHvOlbO~WItH&a#>sn!$!C}8 zN0%GgC-eITYrA{%=LS;^jnHujVPE}_twDedwA~9K2=uaWKP+yv48R)Dpj#TlZ1GIT zTZ-LgV4QqqdXKWZ$C_o-+=(iV#NE%_@20CN*(pmPsNAJJ%DXVZzl=4(kLS^t{NOnY zcPE05V)-S)JUn_|KHw>pJ29I}KjVi^8k;WWbn2Ea;g;Hg*_O@Myt1R6^_kgKgb*Va zm)ajcu-^|ZETnmoN`dzC&r9K`tKOcfm3<9;^Y?JW*Y3458WpHYw%Cm=owK01k)0$H zO`_Z8oB50?R4=}^8Jd?{T6*{L0>1v2a%Dhi!^Y73*Hy#v*T_G1<*M=C<-ml_mdLpz z+$)mXu#1>b(S9PMtmEM-W<*+|sNti@qi5K%aF+e(d=@WTQ7rm~1YgA&Iv+2@1t(@jZR@)m8mQbwP2BU!&t)$!`w`Fjk#yp* zn+wROau6MK74Ke5wUxVGeohOGDMl0wc^M<9(Op62EU$N#mH|~!3Kc!XMM5tui&%7P z41fEoLPGxRdwS!B;fuHpw>;|Mtu33x#7r(uK^M^SURGASTU$pR0_UKoKP%dMUzvh>AJ?<8 zA^>O8PihsR*+n4`F|S1GaJGwP}un%wObIe zLlwP6g>FWRZVHM2)ixA*st^rwrsKDf`)|WeKFfYNc>m?dUp7yADrvVuM%>t|(~foV 
zr}`}yKQ_6Ivl}~~UY@Aan7NLhCrq~}jq?j8>2{zM9y6j~D~HKwh#O-oVUV_cO3G{WM?x6mJXHEL@rP!Z|Qv%7u$xpbu6H z8ynlu&~SNqxuzEbKq4rAdq*C^xIHTv%_1|jd<78%9QHXjMe6qMjTF9NB7*?nEzQ6U5+g=IAu62xxTtCFDduWz$-@ zTH}eHmhe-v%vO1V)&0Tu@81;^6c*>_Yk&TnB>78JA^$^Ez_ZE&#sUCoqwE=x6$<_O zzr;#e$>LaknMzQp8??TA%1=efKCsnz zO-fXR?cpix_A>q@=vV_z&H{r0;3p>nNuFZWYEHFMAM^Qok_i<`hDdSoJ@^OIRn#U) zHh)5*u0C*cb3!mzOW)Im-&&Ai1v7MyI6~5+~;>SoueR-QVA5F7e(R19T367GnTPi(#gEqoJ%&kfI{* zh1)yV70abhG?_d=OTM;!9NhLYlB|z#Ubk0EUteES^Vcmu%x}mwR9m>NOX>Rh`t!5z z!o+HBOUu(jtrcik*Pzq^Tu$tfY}vI?1d51+gamZ=K$N^Deqj$%EWH^CE_a+OSP_dQxhfPKb&mg?(`MK zLq=H?{7iLyJ>tE%_Wo5YNg6qYqQ?OP3JQv%ldmty1t4KS=^q6NNlHe>%*U5E?$@!lS2D}DX^`NdRF&={TU%c+1HzCvYD zQGY;hOid*e741Jg*tKFYQbB)$(5L{QR7FJ;x?^|ksHC)%h=AZ7$O;xiUq^7}i5Gq%e+ad{@tPu!&xFis8*4w~B9z#AkbB(zf~QpaRLATX7r0s>7y1_Sm00L9=rLBPRe zeZ$Ph$7f`;8IH#aIq`oDkVV;d?;wN$+x}_(PrL#GliYXw0HNWrTY>X~%=mbF12P)$ z_+yikmm9xO0IN(#D@`Wq1~|4HLId}1-pS5RNMz(NU?xQy+9v>4o1T`ILv(LvX!w$Z z1ii%uAfk45vlA1L^6Q%$iB-{AFjnBxPgh#xbQFR_unS)>*VWXtlcLukDuZTZN_I9z zI@COXFH=X@^jiB<*>q#LqHk<(<0#CkICI!^-m7^#2R-V;Rr3Tm;d=Xb_w>+^_sLL$ zrD12kh{Q81{69P-VlT`wloLA_7ohII$N}sYBn3k>0n8}uH^@Di4d^r(NC6i0OcJp2ying)sa|&j!CV7^b7r;^HO&ib8qSw@(ii_4V^H`LlvVOf+IfB7m~T zt71l9tZRVkPc(_p#LP@xRaI4GWoT$9R9iqYAaK_34YdAEOwV`$r#>y(X5g}iYE8-3g~|{6XURV(0RH6@T$ql&eu&?VY8pE7Su-RrTu^cIkN8@Nn@S&W$B$uhmK0*O({D1n!8Auu%~)*)z+_hd6mD?@0zKvF zCK@3gig_-QW(A+#s9`MfcsrX1q-{W024+iAqZ;@8 z&g;G|h?95jHS(?Ksp6DsA%)Xk)Se_!Hy-r|j^t&NOBp8dKV1AY$i6gYZ)|Vbq4al= zx1Odi7shdO;>Fnv?@#5`h$6_42zzN_lk)Ql_OEuWI2lLIX6McO36WC`0(s-nzUeerI<9EDn zYBYS5iz-BY<{iD?ah9up$FIZWk3Eim{NE0}S~TTdwUL`^HKzqK@8xQ?Z^U!GpYH8S zgQ09nc4*X`LWu9g?{PVTfoj>Bhv$$}JIiqDNaIuc-DXE7G^GjzG2H8<|GN{qcO97M zD$0w(z|pek@UB}eN)~QzW+5L-%P-ewZRq^CVHblDfX)O*25L~~Vu}i(eqj1 z5i#1Z|M}WZO)XQKk+1wpJ+kDdM;{)Vn_yl-tzu_;M;@MW!Ve8(d`H}OKTO>kSv;=p z3@dT@cRuP(2ME~x#twR$uss^$%Natunl1;ZjpCUQc3^4-Z&)z_vtshVF)B~Q#p zV^!BIe=WI6dlq^1xur88`3X0Q7^xfigUE>r7&z@Z$ z$lY+4HdZYPiUkTSMwdlgev3*@QelCql?n+NQ~lU0^xv#1^T&ng-)AuWj5JY}c3Npr 
z@8{bMH`2?KU5a(3bL@2}Z9)C(5ss3KT6~B~ZlsdS6oRaj)Wvz|>xefGBes0wQlhSa zUbDdXbC_cUZSWPyu0H@2y?VM??OFj`amdSJ3vlv7=dBVwCp%LEAEnj#@^@EDW7dQ% zmSnT{cw1#l-}s^TswTB$WzrW8hZAdB(30)+>aha;>m z;!ut@c_{gfjwyRs9nAcICX0MFOX$J71>a@NI8lZ-)D$0a=d9GSyo&x8r18^)NTnB(n)WRkvKUqd^G(`P0YdP=_C zIs6P1A>aUlrlqC@Yvg2a#7rEM^yd510}Ta1p$8#~=>8<+lm%I*df54z?gqt&AX7N? zkCk><`LC67YGEW;`~=l6;@XCw0yNH+4Z*#86%0cX5;%hyWrAsn>;fuq8s*h`@puu( zKF0;uXM!#it3UY1co#a!M}-_>9)F0lu_}9?Akdqt>9%N}$IY8hQc)TF(o~n(v}C~y@EoQrc+#JcXu~{vLBf{>LL_h1KA1$lG8+c#qXP&8>KxU zYWDX7{-m!P5hH^$Yag-%?`GScul^`OO~R}~Xn~!8ObxJvZV@LzK_SL<+h#1y4^vrh z@l(z#P>i+=a5-YC2c7xc!p7PAh5e58jAba=AY)-pwDY$tL;^lK!@7(v6s(`#Y{J22 znn{T|tFt4<#1w}nML)~iIM|2;HXYlg_oJg@T{E$iQYUH0sZXfjMg$3xN#bijXu0=4 zZ*^ylf8<9g>#TtjS*-taNXQ)>B5o^5J<@eC}B26`! zrN!1Mw=&OnRGovz+Hjc<#6qKP=SNa6#I^~82@`ja>hR!L?}Si?^-B4Sf6adJ{oJYVxL zNED6$ihFQyaQ(~B)bzz}#LSG^f=F%)vU6>5@xhI6Q|4L=F^VuI6Z6n-XONuKylI^o z8&|uucMHI9TIgznwI@1UhPHgA#bZOHMrO8Na-4{x%=4pVgf_e$Jx*P-Ldo7%<7tV6 zr;M0C3DDyJGX+TrP~jjIS;t-iea@gxVr0P2f#bYD*(=I`$`U|bLLl7mfW`#yDM*IE zmQ+OskSQRp-o1MVL>w{>;IOv0wm^x&#Mn45KR=%iq=hdPawK=ujp_T8V!n$SE9OD* zCDIovk}XgWsD6hMBESFkRxBLGId1RGZ=!n)ZRm6fYXWOZYbI-6V_y5U_SNTWp3W-7 z;d{d1Y`SFHBgzX7e$2=p;hMsn!^BK2{R@^CMY?!bUi@ zNt_9ZwW3+9@C>lxm&+Kd<7}CZEWhDiQU1I=Kqm9BuvTEO4i8PzP~2Qj!J}8+aa_}0 zQ(X&94z2*grK6yz-tzgf$?tvz1Nx!CN9O)^K0aoEq92`|P)0K;o&+L=M5l)^9VlGl zwMWG3J@&1WluAKG{gFt3@`!XyQDZRE&?3&%(ZCA{HdVW~q)))LJY^p_e2W-=Z|k~6 z3|o=UgR_@+L3!Za%2Yq<%JHGqVYoi;nZC^1w@4`)$?R#b!13QdE$#ZAU`ZcOC7V(@ zy##O{wezXEc*fVc$BrV-ReQ!bl6M#`MHQAmfBs$*;)jr9)Izzj@YaB1Hm}x~PrA2< z(mNvy={@7Y{QY%WHcgzTY?^JA%!e!oMswKJ1-(zF-RQ)FD=W_v`EqA%JHXYNi-o4N?YCMs1~34h=b{CeQ4c!V8Rsr4rp6srn}h?o|v zIV7<8zAI$Y^pqnu-S>(-J3QVxoYm1PWPuaFOKgqUm;@<{y3N?V1G)^C<>-c620O8h zJ>C6|QJK&>RN?F*Z&`0g@YOs|({h#)FG?FIdKlj`*ru(IZlV(lY(0>1Zg>b1o=R0R zUhwQE4S=FOMz)|~_U85HL&I)VhXGz}PTB%LZAH!3#&qQ(^1Z8MjRH~w@6m&3Y$z}m zUU4MA`E97lXGJ`Rxl6Skm8-r#+A~|DFRcB_YDtVi*7{u;S^}Ox(=fFqgbpjTH 
zemtIW!+>krc0_TB>4BskTM>cII}Hh(8u}XQ7mxdW{ghd+o}Z&TMX6a)oFJZAdfa7SJy@lnoe7rTn^R1!Sct{$te%JUdSi|7rgI;IGqIrCJ<0XIaRK^ zp-_RnSEsknGlcVHR>5|E^vHRWG8MY<)-}R}sC@o7dvc=>&b`~SIqT}rS*S{x89wF` zk*5l*-<$V)h(675O6&Mjr#>f8<<#4GrvaSJng)QW(9CB<{@f`5pj=mX85HF-YO1P~ zK7NFa+9upep4G0V?A2nqq`QG_vF~$`Bk6xjJSvyXAWJmxj$~x!_r$~CpJ(`lPKxmv z;l*Pr%1qxv2Wz_ntVT*#&GNTh^FWf$-LiNtU1E3>L1)3e4W<7`ri{rKp5MLAA)6v7 zJ(9uqG2w>_gIS5Q(wXw(AnKtrZ4cv++afm3k*cN#YYOe$mxE9ay-%@K%7UH;=I&NH zDrRxINpb_`wi^ZoANcK&>f3L1E}Z-vKb2B{vLl3%OnaF)M2>su_&Fv#IRyjMFDgx}Bt*o|XT`pA0VlC9I>|jGLuDDkQtU8V!0F z0PkOb_lnEO$?^Mh<45~BBz%4vWbE#8UM6EiH-zmL&k@r5L8r1>J%KH%Y&5G`qcwXIIN& z)*{QNm+;%oYOV>8&nPHv*}CLq!H{-6%jQudcURqHo2g^X>as^r0skO%A&HgDP~_~T z@_W>k`uV`F0m^f${tg{c(;_;N(#9QRt`J_ zrpbnWJE`D1{ASZRs;<8nH!30Ac2SPG9j?e)#(Gptb8BWyp-B*gF-`~ic=c}bUyw z-&@M9Qyjz=lM$B>xyIoJx~xKy`+mIUx@laiIRV%~MOBqN;6_r=T>nb^Hd0<)&B4Ov zc<;oB)0SEXJ~Cc7Ijul2_RCQjYN(CMIbAkTYfOp%!Get|!hi2^)URq|8DMlxd!`+fPEZDS6D1*%?FJtT zduc2ZU*}UINHh(XPXUP_Ak|}(&S8+lnWm_&L9P-iZw&YT!LMuKi;;bA-HGXL@^V?P zJzspcKye?IExN5=HG710qSd=^Pp{`QD=-QzeWylyw|pgR!i{U9q;vbx1cQWM52DnK zE7j(zk;N1TE#2GFo6fMg(@NzYE*mI0EJEDMoX{FewK%53Kw0fb(K!hs3^4&=(c%1O zKM;i1JU^-k>~I-wXsn>k+Yw9<|6`0~HKx-HW2^(R-XY9!xJPa~&qz2@_|YwvHOCp+ zserp)j}5Sj0t%WNV#IGo4Qh2wO~H~tfD2H2XSDFo?fl+xlX^)>NefC!C@loJmygtY zht)yc!Cwy{F6W;kr$dnP4Q;1x&uHEAZftSh zi_SN97seUe4TXTYQiuGYzft4J)aej&mvHcFws;ofFXAub0`X=}Ek#2R}B#@SJLNQPAg0MoD>6s1fwB4seAgyWcj= z0r>$ErU-jI)gD-Ui?Tx37_Udj%ZFW0?uX^)GTv(bs9SLEk5}4fsaYfE zWuCYI^ylZNRt@^+Gf=(%(?@}r4>L2s;cl8gz2#U59U)B@l$b&S*g*^GekXe0_V;#|CEbNMP)h>olkf&7Lp%fwJ- z`Plw?T?lWO>{LG6^L$z(xCLb=MTUo^c``WuTSLI_6SK?0$lHUJB-vEy|BS~!WiPuP z67VNe@0Zrr{0dKANb-A#j*Je;sK&af94&QTrtu$jr<+lSf}N}rvx+l@IkllA$iqjn zXzh(io`iR)VFSoG4XLrPR~W&@YQi{#D-#S2M^)|c^OXFcfQ7mRIOXB1cttheXUqMq z@6MI6{lEq5=={kj&nQU!`RXh`yrD32B?e>%g^6D&r|Zt*6Igr-J~#ypUh)t-j=J$t z#!4XzfAqu2%dffkF;Z1GsJgjg)ymALAFl-P_ zlKIdOsz~?g4pHZPBv0ZWO_B}Q;aq1M=iP@tm+&`^QUdc72p;UknWEp@g5c+EY$m52 zuwmJD;VG{DjA6&`WX4{+9p&JVnp;=u-DYbEX~%s(S+f3qK&cdMNkP`Hw>7QeZZnwEe0K%O 
zvwS7)d|qrOjd?8JmUyggYoT>NaaL;&ogs$Sb|mn=TLRA4vSgh=$W_QWn&H7*dw=p; zB+gNj%|G+HA0X=4UB;aBPNzPs^5J}Q^Jmt(V`5BX8H5M z8ZXz%v)cmm#WpZ2K!MwYUD%28B>h79y!W06j}>R8R=1m`7(#?1-{AHiq9C<8q*zmB zp3x;#ZymrGp&g)I@LkXz5q>C*tl@aj9=fhp(`{H+lW(=r9qfQ72B z1Mj>*Y#QuU5(}C-h;k9P@0zK%4(Q;lw{?(3A+rRU2Z=l9=kJVPcZ`4;xeLe{1AhGc z1gGp&57el_5Z%dK(0&uJrJ9n_g6HCP!NwHUTq93VkHIy zyVdusHE^$U)vaCuF-?(k5;CWh0kbzU<)H%E2zJ``V;0b*1a;Eo|892}b&EE0k&&Po zrM%}gbG3KjVdd~X@K%aQav0*x3|-v~^~k0~e*p*%glc0j2f$uBArzeq`I%Cvcu(HXvY=l%60Vqt8Skd&UK8X>wB;}>sV zBfe(b?OO^Io>eP1)xS%B9qAKX=Q_ZiJpMNPooL}AtMk6U>j=lx%@WUS-*&y2FEU5!pe?)#K;NAVY=n7R+SzWt&=ge}rZawxsUib3b zytG~CfD&??<~{~e{eA0%!0!#J!h4XTeQ~*phzuN%;ig(S8f>jck6OWl zSk95x6v*nyo~zq`m$JW!^g@vhXE&mp&VHVv>5+=v0IOYagi~g7n7%IN-oKx5fE_ziL*lL+Wg2Srv57=q1Y7d+3Za*>%`_VGkMqdG5V_ zGW@9-wd^X?ulBxuGC6vWIPEY8e02AUTsX8$%McdDxUsQ|vQe{}X+-=C_l|YR4J`3L zQjR@T&Y-a#8aLLrM9SMJUhGIp1-41q#%Fx5{!WB~&X%Y5p{Z$clj84QVaCs1v^3>e z6$GuT>4-s0Pe1LXSF8ydMK#rYtA=TRk&+V^?7ZRjX(n=fst_mtS(0d-rA@-pE1}&# zzZ{RPQG-$KIya~Sckp1-fP* zSAEq+tuf05mL@X&nwBDEF4nA)`Yv&OYnr3}=cfCb+?$xB_xAb8E{1koOL=q|8JapF zCHmpt*LG?@$3gqRMxKP`*_;nh)4q3Vn68qDZikG!y?~wwKE}x0e2p5%)^E{#Q1a-XyD5Kr-s+kKHd!p8`Nw@Lm{sPD&GvfF&8hfSGR1r_dzd9o ztiR7lu>`2~(m9FNIhlQaJooq=>hn@i4a zb=)k8-2Kj;#B^HU&tbtm+OzF}XDpBGo=ekYch9pRkIfDy&Tl2IZ#AGr=ChXb;loQ) z6Zhqazc(d6G|*d~c&(BNQyd|UI5U*dD_Kz-=q}W>9549P_8h^ATmA)aPp4>D=7Pmt z{4f>FN&ADP$&#qblk0h;NaP<1@F=K(4JV+aK1AJ?Y1gds8yS3Ne;XdRnBjb4rLAfo z^44Rni}GbMQj(?}C{ts#QO<{-6%%>GaBr-MM&ZLd5cOG78eK$twXrFUZP+`}!s|~$ zv{J_l|4Pp(UA6y7f5=SiMaawsrxjqkE+p4t`_NhXV`6JOWN_00&LeFVdF^_Q-bXx1y zNDKjBBCAb-`N|v-2A7}vsS|!m+3DK!^)V7U&PJmP05qb=%^DXP^=jIPgL~FH%BItixq6bU_5oKseCiO z+HucvUOrqjtkMMT0K;Aw3}TYu;ao$s5R&+hD^6axxhfydY{18FP5IZ7JflHhLpcl| z4^H6mG+tMhEhMi1&23WZ9>fnX$8{~G6yCpD^u3}(eaaNHLvVk?+6;I#iQ>lKLDSlBi zcr(!aSm;*|3_UkC_QzoQrP0mGc`WUK%psDcv@#JE{CNPp93OJ)kIZ1&pyrAy{>4Rg zTxb9CCCx$21v-vtN!)FVVG&=bdcDTw^qh-v5iu`?!$&gDb|MVbl=_?)3BQ5u{^9o_ zV_&vEZhEsM9gdhHabM=_H0=JTM?^Qnd-RWGIkrg+2Ma4MHMs@WwX;FD>7F_!Jz0kb 
zDk|BI@buxp1peMOU$g&VQ-krp%DGKm}hictY2LbqKL+QM*T5) zGK&3)1ECa>3hISoLs3*VxaU|E&4zq^MjqR>#Fe7I`vAhIx3aG|?R{nQu!#HEGm)mc zt?my$4TUoT6D(m7CG~i5UVjVAN876>KO-rmyW&3SgE{MI)r?5+%XE-?#R5iyvJu?n zWDx}~N^Y?Aag1ffEn>@#e<)0Y4Y~NLo*;~KODYr{u0Gje1`&VF%^{E9o>JNIJxtF) zDB^2JMXpC+V(C8oGy+!49OyT*XxJs@MPvi#i9Ci12gCCPB!(DNNDU6{p8l^#zfWH7HKDjKHp&rbcJb^kgUaSKx;^3Z5gw{@bi+^6|Gf5r zUjfcdE-qOQ6Bg=%%b={k>#`iK7&UcSw$Q$w{!&5j{l$oxSK<%LigqSK%prN@)it+3l{Axy0gw4@QE};*buq;UhjdOET^?~} zRwVF^)6!V~yG6HKGX=u7pF3a}mP*ooJA_QQY2|RzC7pY5pLM);gUzS z4868efj-bZkgsb=b*zp}J(Vzdo@Q8zhy_pUPBhuIUL!_RgVtx2%%m13_#@0@eLB%D zVGvJ$S1|Y^lXW5^7tVa*gAN(x^0pYCNEuT|na%)5*70{-6FYK%noRtR@O6K?9Hw(5 z=F|6NmN7d9<~c&=)kBe#@81WXf6t?CrCRHARPdvb!GAg!V1qw2zAiPW@xe5z{B5dV zdGX}k!ZmaBgaM2O;U*8i-|L2zT9@`&7}a>&+CGgzkU?@Q(m7gc+TFRS|7OTmMI&)g zNRfNQ2vp0I8ku@Q%tJumgeC5aImv(5Br{8ZJDB^T@chu}3}|NB2`lUV3mGUblE%RJ zT*evun{fL9<$!nLgUq;<{Fhep_L|tYYEG%F;NL$!u;(sqTXhk6~m#O_IB`g zz?_C0|AHZbj$CXNry_7PoI=y`Lx+(hXXEMZapkM7Ax`2zj9iyQYiz=P>R|_!^x>1s zyYwyt7f(k^UcZ{L?AU~i@VV^ic$arzrF*&(JND5nY6&?z3*D`SvR41XVeo9!ehj=!#F_A#4_`fFxz;v z#@5oMA-yW#*@wVfx>^Z$%;^{G1@(`2R_blsGR_&+ z4d$f}h&MkRT2eVV(J(Q%v|N|co732r0RwKiRX!yQmYD@vd;&W#CMbWGw`=HFv)RzTv~ zwGJ}Ut$EfHlX5J;slMq$R7yv1dj7<3#EfcMcdxvjQU606o)nj_cJKzd*ML1KSj4JV zW5o->!KNS81Vuwbr(CZzfOX!BjYV6)KEpq6W&PkJT$AQo>C0ynO*34b}=~mfNck2(brRq&nD0}LyLM)AV>BB)(7dm4#U#MACuiAyl47UNd<`|(tDbbEz4GI^ZI$~d6Q~-4C{gS& zy+!Y;#x&DRy+7Gmgi!JD3(-&VP|yoiJvRS%B-wWaaG&HUjIyHnW9<;rqkjg<;Yy^v zx`yR7j4sE*u?g?n?xZt(TvjmD5JL6g2C-0qLcBc;4S+1(5B~k*kW{rfhwF*K;JFU6 z^8+QcsIs0`A%v&@z8159i9dA^ir!#x*nI2BX;s%BA@F4J#Q$Yw`0~(g`9@7uFC%Wx zNgyn0XzTe_0pztFI#W$HAT+B;G6nfYOrTj9b+>~@h@Z}VxFq8pv zb`w}52j$M&wb6ImJjPoH1ep36^-X$}cTT zx3PYh+VX+#V9rp($`t4OVHDh}K1@-Z+E8gx!5yXy+!WnWYafjAe5!{F?qT(grQx^X zsynS__mcH(6LjNAuao)Lh*I8L78Vv+D=eeBYW1+X;nUKfZgEal5#qhqkg)vV$rpPa z9r!-sVvn%*(`aTxguai1F+ncw7}>Dex@{976_5|#l^w~V54l|;*imGsxeU#1Zo zPjr*pIz=_W_{NOk?m~9y5uB4AVD4PIJbV%g?gzDqWn^F~4)l6A%{0By?krSwG^OSb z|5spc3f5uqx<0cn3v8>;54oeU`SaRsXoy`$Wpp`>Yc%-UebA)z#>2fmrLy-8Jtp&E 
zMT7@x^Z$Mt*zow|1Pm1%@TTjx8DDJ;U6F7IyE<`3nyk_MZw8~rhJBS(LLiN>I)3_X z9#zNGkPoc+y!4FpdeqZyGk0y#(dYzh`xW8&XDF`SF;M$**g=dpJ>Rj|@N&#gpG^0RQ?Uii*;_R%)WK0RTg6mk|3Vyn{x$?fZ?^uPY zODzjKjBW65NA`vI7KaF!HvRT%SOYC4tgth!EIXw}{3P% zU0?5cTlwAW7gYp^l%s$Qg~1PO9dK}PiefCIe~a10jrvBxUL}Ar0wvuT%iPoIRA4#T zg-Dyr^XIS-!84Kgh=N@P$Aq^eb6vvP`g%isJ)jm$6MW&Vs;PN?Ms#+4H2iGiXJhO9 zkhciJj$-F{YAb3)z%lyAwt%g^og=*+2z18t#|jC2r>@bv7hZJ=<&@{DBB+=#Ti06! zg|e%%mGH@D6x7JmOl(-x;u)(REUOlyVAR4X`-U05AwiSjZr+ybaLSXawYGnPvnp>c zpNsJQ!!Xansny!bEUM&-W-0VWsBju9AnGrY7MjyM((Gg|R#Trt#7DT{;X@BoM!{)J zKDzJhzg{k**u^!9624^`uH}S2b4oDAqRf_q@e=M5vL|z`XCCaAKfF2Iq$VI4(NF$a z7qq0YDCKUhAO@p{>%L!zeKV6mt>0WoT8RZG{a}C7mGt{i2A*?7O-;>y@{#b;h!H`l zU?`6Gv`uIKj{RO!f6LA{J#UkEO5gQm-Ec# zQAwP^`Bl|Iex^@&Po5V9?f%;6%}hf)`|_?ph&X1wW3^WJ^RmCHqt}?fJPn3rbQx=k zVOp@@&^hl7{Och}i;7^;IZe|E@6-H}CQ=B<_=zn#&cp#L_zQN4E=i?fnIvK$({O*I zqG4WL|Lf<$4?T88$>1YOVk0|;{p&qREeD&KelUKOe;Y{@JSg1|4OiMO0?sFDB3a0t zguq0t&GRLFCq+9wCJ~ZgE|R%E_vdK~*GPI;HjHqD!Sw9&3lt; zX@7GvO}_0iq*qYkbsc7%-ndt%L653jj~Qg5F;0kL4|Td zq%tQ%h3+52WmbbX#(E)lcANgG-J(o2uXmhx;P*ZCRf|8!};)kA?*u4XN(%h0}C z>oo(@JZ4n3Pj_E0BZ9gc&m&mp?^eH?KRw&Y@5QsTOX5p2TEVxHA6Po{3m?`N@gF*K zxlweH9Q{tObkX^rVgv{vj>8-09b=#z=t9BSy0#-kO zQW{pP+3NI-em!>5(<8j6|1!M9H1hG)C1wRc7ho~QIsMLvSX3Qdn5^pv!GMe^kQ#3t z;pty0qz?frT1VN^l}>n|73(eG{{cbGBXxh0(<9ir%REPLeOd~F$MhEsT&R{Mi(?Jr z`OV>nktz~i;1(}B6dWbxkjK#jvV#(b@F#0?v2r3lGyw2Kf{*>6{DJJha%O*K5|Nz1 z_ngRLwsed+p1o5FAVC>K>b$z?v5;h#)O%e{-Kbzkp_01M8=< zmzOuG7H85ieKMXwBy7Z?k6CK4$6uNI>|ANk`H7DL>^4W93AvyZq!#x+(l_IOF%dvZ z{KM>V18kyD)k2h&(yNf=7Ts0c_y2j}_`EuoTH>o_bF92~D{zfSqeR4IlC7sne|j9u zuCFnzEslU4F%7Tz$=A#*PWRBAP~T3#r!cF;asGT7y;JlQnYNhe;Zb(9xkEq_I`tMG z(-uG7ZZlqRw$*3(^Y>}O58qtN>!{~%X7skWRmOJb1e=a@Sg-Up$Pvx zPg?X|JQPKzOkNcRYFD_JW37W>mpq>N_%_r?N41*n#UY5+c>bw_h>`@F5hl4?&ySHA z8RA31%a2+KajINi`zVtwG~keZi%!PFz}I%Eskjx zf75r4;G4_-kz|93oeTMcYxB=2NZ(Ljl4KEqQb2}S7II^#187OdjgYT@LQ!hgT6S<1 zh{9BHk0;j7dHJFWzIT8*s0#r0{RXd!L3z`YbXyFKZQO%v3RXRE-^uXfIl`aVti4{# 
zojY1S-mY@t*C=H?>%{gX*Bp`78&g@L7^eP^u%0&}Ih40mI9|$%`M7wZ9mRjWA!ydt zZ>=C|sn{`eBi<3wwVNyI-jnrNVR+W-;IrrVJNb5JKen_xUqM;~S+J8CY{UsWViyou zY9fwmGBn!KGhU0ihDn3nrNN|~A?Uq7zT?70g2z)+a$?BY%G6T9)Y7m6*1CU|0*cqL ztuyB7#a;2JY&uc&P#k$Mj5i-?Eu!;l?LKPJ?3B9clv&6B=4ieo&Bt>=!ah z`TQPX@vxv&bVp8Lh1{?z*~l-7JkFhW>RRS2VHlblDs&kNJ_T8CIseT>t1;>s>as`d zAWV%xwbjBE?xf<4s1!b#eaM~in*{YpCR}6>C0kuv3vO&H%1kt6ODnB&AdU1Mzuy_D zQFa&yzsR&?UsvEx9I>*BU7hI*B3t^}Hp@2;g;%byEqgPe(lFHHE@(hBnE0stz>Bp98F2p%v7 zqNiW;x<#g2oru;s7R?u$kJsTPUM(tw6{!KRM{k;eHZudcO~A7D^>M#lt7fLd#oR|V zC<&ht4R&KTzkQGT37}NT#B2;6yxZ7mv7z@t5nzb3nbBop6AMUG9_JASzl)rm1~oR{#dc9yKu~qF|glHKzCaYUkBvP~+uB_poI&69?T|I1ocEHSnMc zPSPJ7a8cIC4!yika`Sp!%`OaBVa|?;po}3*Nj*wP-h((sq30|GY1)FB&swt#ob@e* zptARgQuppt&<=`@Jx=N<=&apYP1CpGijQ?`Fo=5GY!cD>Q#kchg~&eB-Sv3k>Mavk zl3G#_`qSQ0UC6Oku`@?9s>Q0X!>BaUCii!8PE}h=a(HN4US@beW?cYpeO^g;d}>*I zY+HOydRaFD9`nBr0HJETbnaCMGW@ zVaO->X4;9YjDjAAdJw;Jh_-8pxucKCHPctS^UmqqaL+m!p!>?zYu$O0{3rDf{?<+Q zvA-Gtf~&17LiYW!2X`BAb{2&DE5s{gFl4Y;aL`cD5cwdvNulw9$w3lY(qht5<6B?6;$n6A*CCEhyO29gw%ww|s?78yV>3uzgHx+Giwy zluhQ+6K3zkZywde3RTula&0Wx|K6}>NVi{Qehc7V=pF>_|8_n~sO6XX<_#|pFqDf@ z@mLe&{l5OiCE#(|@c48yQj-XgGK<7ECpfV$;MDf^$-LR-O;{rm+`{Zk%e|dJjE#uT zO-Kg_tA(LcNCD%b@B5{i5Z+)dY8XNON;wWG zPYG2X5??(ioNieR>aANl?~isCr;8@1SVOJWYs5t7fvq*qCABVREH567G|U!QN@f?o zvJB(%xeKctd~JTSUz6b=A_X=3J~c|N&c=7MqH-u3$r|Y7VL$LE@*aY^4-4o1B}MI5 z1h#h3h049?*C#vg5L~NMHz&3)8m0kb*TLdDjg0yZ-ScZAm9sT7EQ6*`u;5L4O(ctu zO<3%k;MeU86XD~~XVR1|KScpv;a~OL*p;*Kv;K{bjf;zi#fQ;ZDK|4W*`E`UZ$ z?8(nGcF9{bG*pulQxnsZ3X~)Tt#%x>}yWc|!uwq~&eZThKS$Ml_ zYN*;tNl8fiNkjnSu!x9gplBSBkQ)ak3>pBc{{`X~N4QVkD~O~{!9|576B-f}5*#8P z6f7Ac%8snGGdg-QTAEgR^7jNwTh3bB+L{j`6$vvHi7+QE&RE7YT1f+QIimdHMyU)4pP!|vyA>eU$0Gu1t3s_j?FR!;Ao!t-- z5R^wocIT&NrWj|N8e1E?L&P=y1+AGN%+E0|#y;+K14N%-CL*w2pNx=zH#zr2Z17d2 zKa_nMo8hKEkg={=B8U*uI2_O`=*il%(wI*~rjO0N2@(Is4{v zk=il%{2Oo^1K$s`C#c#E%qvOPe&KQ~;(1+7=G+>zmtr@ZDqgwg@OnKc1ju@~R#%lB zoRPJ)P=7HyUwwOc;Upo)V)hk=+8R7Sp+6|%wwWTc zx_$$Bo;_j(J1O 
z^GMFQ0?|_qqnSg(xfDbuaY8-jPy(_l7uUDzi(~pNzgGyLCRNZq0>rQM+yq@8+>#a} zm?1baKWhiU|ML=JT7w2T)+Tvq;Vysj(Xssa>3#BiSA$O_-yd>p-yY>(N8Gts^uYBspYY~jqch6rxsUD+_ zAc*opWg4Dczk6|0mr~yebnUaQrjL$zy`c<65bP*|Nel*CR?yruuLKW4Sjk4zhq_mU z!~E&cI(Tz<88F;pHt2U2YJ`ds4mD+9552qfNBzabv^>GDUrD$bD^8BUIdWzN z0}2i*&DU{mWkF|9No!cn7y&y+a4vJ|#P15+M%d?OA*cx&1c1gmF!?+)NL z1^`J}D7K%Rx469a0|c2UT;4e?zi}nli-&q~>k%^0L-`0^rLbrmQ6Gv<*>Max&9ssp z;jff#2COFUQaaXE1;JHKOfkGdJnq$dLk@p&0Rs`O!`8Maq(1ED_h=XK3R@2_MuDDV zShK1MV&i;9%Sy@&93%h0dD8{L?Yp1`^*lw6tScFSd1-SQvaksP_B!5Kw1r2K{Ob9d zmp~g5equ}ILf@}*bAZkD=NnXiP=N)O==}a;5OQF3E}~3t|2t}0k~Z~BX-1Y#GwK-r zJsVOT4VlO7ynaEBQvjHpS{M4I@EGbd|7cI)+YO_MqjP6hqBS(dPeIhi%J8>w#Q?X7 zx?n81*}m#yBS{*1*c`;|SnvC=dK!}f4-rpqR4|y%$=oX8KfTl;erW>B7@JC%EXn(J z_4=KBb;h>6*BR%d&`ZhEgj$;ALZP(!7FmNTp z0hwx9{-(<(28w`6h|1ABkZmcw_ap|#57Uo*;9F@}&U0gJuLdjl+`McIl}BJqZCi2S4?_RfNfrQD-LU>{{9 zni0C%;E|rOHH2z~mh^*=eDOP$b&P>rK+rjeil)=Z@sY{M5oKS6P#kMg{PN#`l{pZ< z$!8P1NRN>q=!JkeU8ZHXSdS7(^qGg9cU{Rx4hi|b`$*;nOS3UXW>QF$#xJiYnyFj{ zbWSgi-pXsw)9wwgL3KgT%DAg&hyDU(MSw#=^m;AN%>Dz9ta$wb(ORL76Ao~E)nHEE zY$ur!U}Kzq+*m!>QQeX}d)6T^@^PSO5}FQw9+7`ss%zodeE0;?(#RzFC&|LP}nm0{>H{_x&F>RGi$YODOdo% zj>{yTeJ;t3IbO{L_KjT-+QJLto80m}4d83q2IK=%g0Yu=I*Gi65ysk_5k0W_mAVFwuPA_CERh{+wdNVO2Ys40x_>X_3aiytBm<) z;BW}hp-G`hfNui^sk8I^#eXzsT5C)JcfuS3Fmzo9>Z)hqrXfCOkv?|(nV%=cCBR0( z_lDiZln(o0e~oP9bM0IC=N<-oD;Fg+!;mn=9Vq_X>!Z6O&VYE}u_s2utWa;;<5SP} zTQJjJI)$JZ)ndy-UFl`l9l>&}vVXTA>~Q!j?{HeA4My?6-mkhpkWWkIy2Q(|!O*D1 z;JN%6YA;=Hb;Z=lE1Q*3+DL|xS|$|?It z2V4v^&Q}|&1r(n7Ida~uE!h*Tew#X?;Wf=0X5E4w^}io~r5kAg_yphey8Mh+H(wgh zc4{YETE7&@xb`>JbKLWhe&AUGofAGA$&jw;IzP82eLrZ%e5>&SqmMd2EVXZ$h5Rh= z)!jWeCVg+6Ja?i-Pv?`K|FOGoEU4?ZZTDb!`sYWp$LK}qq0p%9)Y%f>dHq+|%^2!#&o zzA;489!JJIAe+XIc^N`eF1XKWvdL#WD&gVC#@3_f-Bc013%gex*Z|;Ng0`1}0pX%b z{BfI9ec?>K2;|Nb4Cy&nSHh!G5}$Vk#}OI$yK&0p$QvKv04IdKpE002!vD`yA?7s~ z`>2Cu#nE+EY^D;oeB|lyGYQZ!tyzNkQ=dvxM2*G)z)kCqKM{s2e{?i(_X`6!nK^5? 
zHfddPZTpDl*P<$Y2o+hxJ?^=r!mi;oaf15PRo)m<@vqT45ZLztRM&q=(^8~Hr?#T) zl7Ic5Km%zq+gczkqkZrtqAb6J<_T*r1`BM1jr@Zc&)Gm;lND(u8YKL0`cwCX7pBd2 zK;6wYft3d+2{X_1gMO*($zgV1l(S|mJV9QrCNmHBPv#E~!~C!DVn#jc48t7H*O+RK z1fS-nJy^&n8YKm!*c++lR{YU6EyvDKlZsuHk7OT}W;OHeRET@q7CX26o^2*L?DH8E zP&`{989>N1j%JceJ=`ZFp%ZRbOC5szzP^H0RYcP}cB_|DMU`H6K^l*(nzogA#)-W#+kn(-~21z$lWK& zKHi*o)U6ocAgpuAjVoyb)gzc0Eh)O%C?IH(QuwxSX8jrvt%+HIk#@HH&_Zha+?*9$ z$pgFkw0aOxq_{a#MjKD6jyHTQDXx@T21;_1>?3#t7AArFgbOvH}> z>~C9Q;9h{1t{UEtWWARF2Vl|YECJ%`KckehK9&_j0o6JJX1Qo)KHD$IGMd{A+z4)U znS1-}A8|*Oox9WKLY9@gZua%d|y!@u1n{aKYyrUaH4{2*U=8jm+Ja61! zbb!<24vM(+_+WCY4y}vVGKxW5G_Ymm9w`@68bY+}DHT6%L1Ox)A{rM|lGwCVPVg*u zWjLZ!WTjb@685ibqQHhD9#*uA2k>mDJN^Npj+ru`-PbEm$v=pfUGI&o?+1JVqoZ3- z*ar|9L6Gc`X9_}0XEOK8$#UGMaA;MJ4`B_9lt*(6=&(O7b#ktm?{EECmNADqjRa^Lh<4PnV?X+Vic&5-w7K|;PN zQBvCtH8-`A^rVmiBr;BBG2gt(`brMOjB>L8tzrvomdzyD*USFw*PGr~P9U9HYmTRB z{popmZj%z2x}NOG;+P;9zpB;sE1)y|!{v)bPs=GkQ#i`hnKPA3Z%R^_If3Pa>-OTFCdfUFGA&z4f8JKf&0yu+*pV=j9Vo_dN z6)}ugV;c8br*lAV#%F4M)%TQuA#=QiAMt0(p=)i@t&8g35no=>c+WMzIvmi8>ckL~ zCk6lC5=ut{tqwt0l!I5)10mQC?#$t1M8VI#4DutKBd3i|f`}Yz1j5R$DxkS(-x_gi z@(13|@U*g~a*~c}JSJgT4_g|_QHjIJiXNOx(C!F;*a4^*v=Rrj(vx%Nn6nzmESx@k zZ+Zk@Y$u$WNGri&14|Om69yF5(lZZ^P&wUpI+7V0dTBaV*~%PJU!N--gNmeRxT3;c zXLX4F7mG}B?)7X00BiTSMwQ?d z(T~whO^;1L4p{Qi;bAhsvlk5&4F!ucbp{Zo&Mm7_`A0=m4=^-F!yiTbcc5hg#1`F} z@0o_3S?J-n68FE$sO&8h_*Ua<`O@0C5>nEw&vr{^7g|Zzd6U@|qFb0S6ct&5`w?aN z^aSA2aHRqaeWK@Tn;@=&7E{X>L5LeJ*vj+q%hAfvE=b5?GN4QazT`-l`Bv zsJsE%}Hr7VU;Zl8A%-o8D>F+jWn|8 z#+_`nm!*x{vaWM*GD%#@s%R8|f)%8HM&98S-LBsszX(O-Gs+)_r3 zVqX5V_}$zsqNP<-HO`bIJhd+`|04 zy+iBGXjpF?5u=}-n;jyjof{@2Cnh7N8KW7cqM4*3r6i)HS8d1d&Em`S%OjASKzxo% zDb9?5$4$-#%Z^ya3|(JOk$4c!zlA z@VZ;-+i(bP7HaZn5h62XDvi@qjnj0EbPeG2wbfP3dMAbh?ba-}Hlm&#&D*9-a?b0G zZl^C@lcj<35=G0ioRu!m*YZ>Og^p%7oj*F8I-6bmT|j&g5I-k6AWAwwYE)pf2&~Kp z2aAhn+UdX%g0HV0FM)w}LjGewpdEY^^4D*dI4mo?DZC3mAlt9hq?j>9w$a)LXCBz# z1qaq9h4?lU3!YUMCHIO>{;B_>KuHoaS~JAJkq|p=+M=}8kNU^l-G0i7fgNxIkZ_hV 
zox$AAVF$A?*1b+t)fIQ`Ubm1PC2rn06=0b;5G?}mmzA;Z%14klY_n)n6F%i##DN@= z+a~8!QQ|VDTPqFhImr0&o!mE^O*YaAQ~l1Y3Q*&8CQu#~nwc^nXUsV}0eBHu= z>+GcU6u=;`-_kQ6w9-P?)g@NfcknLu{@~qI1LT@Q8Nsv#0Rx1haegZ9&^qbB3hTr+ zEpIO+=O`!R-!Dh0ou%5`{?VPi(w+R$xV`6Y40r^eFC0()Z(SIY1Co6|TA5f%iC9i7 z#q@K9aDd@Qev*=lG_X!joaDi4&4x*?vZ&SuRs9b0>3G?^v%quZ*&X zqMV=%9Q0RTUk@DC1w?iL*&RTFM>$QoWx3^<JqMe>3yvzYWNO(9nP0h5^(%X=?GAPe7!%yhT zsq2@Vxq+D(My9WC)~J9Qt!Q^Bo=Ai$lqof?o~jlhk2vv;g0qV{_2UdIPk{E) z!&Jt`bNr`_BRu!%f9NYYZ#lVA5;6gb9YWOGC9)4x6mN)OHDnZm;-PKDK7hvB7#K;~ zFR-H&+lq*!8VcSF17!l!hhd1=xXSJS?oZs)iIhuwoh39QC+HRtOFM>3G|9ZWNpNk6 zd2Jo(R3Fpa5>ej}R?80C!~tK&2HnVX$H{ub&b(2NSl?G14d3}y|C!R&E5QrTReB7~ z0del>5AVDQ4}#jKY3sRm^~Q7ctz9>y{RbRP(Az&bIPl3RaC*5wI{85QDUm89Nr^R$ zArKj!wRxp^>Y>f_ol}`7*O?pf3wPq#FXit)jQ&Qew^BGHQ$xpo-x-t?bA~5OOemM8 zW=}e)hfu7~5s!CV!_4W1j@_H}neqgGrN06I*cP+4`+#9z))3!swDiZsW zqj9BiaN&wZGAk`$@JG@1G3b2}DelRP2>so!qDJC0MOYOe1ChYGj4B;AyPgi}(vmH& zIFcz9Q6JkiWb+j50|hkrOP|U2;}u#wZ0x4m+EMb1c~e%bL`3(CX zD@M>2Ty`yRzU1Y&VoO%h+oh!R__VP(NiDw0lJC(vC@-E2y{q-p$Pa3YZ;UejDdy-5 zv7xcE&Yy#PRB@$G4KMVngKfpQOuOMV{N^o)@jX_tQ;Mz6aLdnB@u#J2$A1*-ICunX zY|MNUZVbByYg57FjO3J)+anEbvSzebQG#%!5)sm2P>_&5pIYdsvZwm2%a~p+TsDQ| zPZfdb%M#xQs#=qY-`WH8jt=`&lGdEC%CS!i*bhxQ(@!VZ9ZS#_Sm#Q5G)mxQDx3rSRJ1u3!o_j z@AAG)&=czozEExc&_6D~XFbPaIzJLI*;i=@4-g_yS~3Di=tRbBT2BM>@LSkfFE7VJ zGImCSkY1LJaj0T|z5!yhfNBNhCi?sE~QjA|Myn?J_2nx1`10rrt|3k?o*l)Fyse@q}EFCVNw1os+ zJ;KiMpa{Nx@{D!B^L^q%3!4O8WeXY73;*h5U9gM`B|Y*~T=)t7h5Aq@H@%=tQoO-+ z5Js)w*W|MvC>o#1Sb?mCN5n!jGtYl)y5^?=vJ-{sN$zO}VXReDk2V9~QGABKx>B@P zgbdm=8PTAc7Xc6aQrMxmeXYrtuP#bWT^(q){teG~2t;n^=+IM9+1cLieeUM$79|Ci z<3ObPNgN1-fJ}C(-BITIkinbmg5fo^`+KZDVnzs~s8{6=9QG3DCK)f<2f!1iP#5WxXZSM)-81uHE|Rk?l2 zzTK{m!}6G_xtQz4mxR$=5ek?GgnV&zj(T}L7X^$0;bUbbC7{T%3iOXx zR|R>WxrBs(tGbSwTI2sb_m`);|9kFJ3dBSH}_9Q$sN8j{M?K`I@@uoa z{rGCA?UgMaFf+G$thp`;6lxhC$z(@O3TZ0jMH}Xz0W;$Ij4}4-ySq7=wCMQNL#ay* z?ev;T$1?Ul7rw2tR5Tyjv963J<()mSuIp{ivV4ITsr1=T>nY(9%RS> zrE}Yjj-WfR-2{r~pq|w(WAE_LKo+o+1BlJY3W}70slT62&Nj12C><)xNR7NBNGQE- 
zyAfUYOuej6Y~7g~DtqLmW&M(l^TRTWOUbH20>LTuPEbkz*SV8e&9Nzk_7kgW&pqnc zjdA5eoGHeu`Gd`hy2Lb?hk1 z<#E@DHe!r!wPAi?z6wL<-x$0|w+0{4X0qlgqZAP^wjSkj-l%VEOpJ*!G&L2ci~|n_ zXc8J41AS?d1<;%HhhqaNIegHG9UTE_V<3U;pZc2^Ic#O$Z2b@EHC3hz*7TrMlN37 z<&6y?LBS4SUQ6^D3kwVMJ(E7l$}<-ic4RSdwvFnw8_T1*j%60agdtoLx$)0TdAA zWu@sdhUp3*h@x#r8f-hclKEjb2teX-E~NDh`JBi)Pwthgclu@?gO# z>GLBbpUeLIILrV1l%wzSkTpXU-6b+(C+_Ect@v4G<DSg6oUTwGz4E=Bs(IB4y`J#fQGBlx-2{hzr)e zV1V;_yTSxyn9ADNi>k_|d(q^Z)y|K4ARG_dJyME;Z4$boU#g4&YhJ(8Txj*PK8t1S z$$91UGdVP)*5_tD#m2RPgGPilfF0JN8yV4iT|XRir*K>p-h>1|Q-=|_-$3quFRj(~ z9a`+KUvLNreVcaRga_sb)u;zw<>cgqg(3c!y169>&o6d8Jv?-$0iD6Wq}c>~e0-Rg z2|$+$sL_D;py!VB0%f*ZU{O?674zO2{Jo{+5nH~hwsu)%rCi>iZbfA!76!)R@^WKC zL$@LjfcYDG;e>*Q_73GoEYN-LQu6zLdwYA4nv?`QieamS-R!VNRnLq52fe&p7VQbkT|z;FRdGA;d1FcOcI>i#krBm7PpVd$?A|j4m)0{li#Uio?~5J^o>beB!B@ z`mp!|x8@K81q4oH<3Ntvl~ip(&3#qp%MsW{RQI(PP(mZh-cmu;2R)X;!>6}@Y5WZ3 ze(mn!_o<6OtMr{6J>aQgYx5X{mN4=kA>dPiNg4oT=j}ytO)B+z3uR{&G0dxrn*<*+F-<&2)e4G6Y_zgzjj- zVe*4p6dL4z08vXeI0dBpt=y9UFSE?GtCv4vws+7L;r|!}?AWM8`l$3^eY++ofReB(Vb109gt7 z6OvZ9#eo7y?h6X~7#MVIZ0PIhtplmsOeEk}(4{vSyV7I}wcafPL^AR5@%`&r+1NIx z?6wCG@2}a&oAv>lfaNq!qmVfKYx;@Iq{5!=#t~c!*~AT1$U_O=%AahNcf~WIvu~8* z>bibB%|GQ3y!ea2*f`9705CEEBKF9XN?v=M$Ts@UNvW9HC2y?IlZP%{@Bx-*3c48P zW>Q8G1#z@+0Lmk`Y z{%!;ML5^<7p=|gHG-09{3TYKB^=TOdJ$blazm{+Q+wMee8t8darDge}JO{ulY>MyJ zCg`3K>Vy0gAMx_so8{j9o>hPB?%-lvhQG`jkT|M9C{nthR5Bt!Qk(8_H~MT1mQz{b zF{$KG&yW(CJkHSAxfz+}Uj7D*U59rqj2Ow)~P&EtDfsFYnbR+fxo%@55ul7xg zi1)D_eUt*xy@32?k{_8uE^K324vD-!^aiwL!J+v11*AR`9-{2U%|wl_xnVV#a9tEO zG$WD{(3PAzDTvJ?Z1KqN(Lp29fQ5~<%YpHoaedN68b=n5P6h zm3vDVy%xuPq-uDDr22Z`Zt@78W@JPJP}2t$mA@xoy#ifH@LGV;85I>3uqKCW1A{^T zT!=;p?yaq@Ki@8Ihy7a#2nl<8dVpyFn2GR$)Uvz`F>6ruba#%8g|*)5Tn3E7*B$Nc z#l*x8kB;np z0zgrqB&Q06PR3Ivu3gm|nPZcQzpiQgY5P+1SOsL4g`vRG3I1iM@G&=qrZ7UVG=dx* z0>(u1``281Z5!&XMyX3L{n%DkCWUv;+$(`q?b}chFceD@gjSwJjXXD{QkBeLXj|e+>R{ou8a? 
zT0}D5J`=7vOb@co%v6K_TJ*f`MkHzMZ`%;8mOX&zIRh7?lDymz|BbE}pnutRzP@{g z4V98&Wtqhi?+wcODVCN4vg7OPlWmY@WBm<$eFMB-(dBgdS`%0AK!J-n-ybD{=7Q{c;*?8)SL4ICJks=F--*3@Aur#LqZ{5KC%i*c? zss!CP`6#^mCO&R!REATVm})YS)n<$W@O)7;&(bf$A~DajE;iai>#4CKoKbYYF!0$Q zk@Npau|4wnc+l6~-3?mBfq}x};)BcG5o{*CZ#xZI+S+N^bfDsBZg%_ry|S}2i$&DV z(vp^%`cERGACWNPz2~=FQ4(74nxTb3-*kBjvjgE|;DZCf0ge`6!s3QbNQmeJ!(C8b zJ{p=G9T^Ez9e8)~z8P^{LR`GZ#f^7yX{np8E#wsV9YVNQGx?FszK=%sW8j9lXE)t| zu7FuBFDuJ41dOFyS^8d$YqxS?`MKX>-yzs%bi+jSEMtvM793AO^Ygzunt3WoY^lV1 zYJoj+`0$i@%2++QpS@$vTkyADt3?F#1qFSJig$B5tw0umWPDjt%S;ry;gg{SGlx(w zp0J3#a`jOmwn_tw+~fjU1o8~bI_;m@DMfS*qOW6LS@!n@1U9nThS;xgTfRt(X;yX& z6vtq#rYK9)D#?J296;B7f?_wJm~fpBd1|G_J^UH3>s-s4Sj`#3?Q;T_wkUW6=_CMX z)s4zE5EgCg{;u}=Get#np6~0^*mibzA&xuExB{`W-$>Z)J%D=%hjg!(`f=D{*=4Io z2bfE1HdS(WM*(ia!78riUiXC$Cm~GHgh`j!Z!DAmuc-+Ot98m_Jd08NA;*k} z`49>32ssyio%vn|gloR&>S81WP zY`ngo-VFv0-}0x_&uR5vyB<&U?9tY93Wp(4@DlpM5df9l%KYn@W}xS>#^sX%y$|Z^ zRv0V&(}BOEp8V95@!;RhF)!si%`V}KhSw5*&U*(r85y9SWcKR!%IxQz(&Y1BYjd#( zA=&D5h-SRLH93jGFIISvnrG$WlK5{MY%NTQ`m_;ILc}R3#gC3HK0=DC=sAu_vhp}S zhDNnQkR`M@$y32-mRtb{7pRB<0v>?Wu4_MG4~BKJ_D|#swH|(uI1kE`kNsdDyhUbx zn>Y=4t{+)-giX89yXaDBpFum=Q8^qq#`0e4bFnRg@VDSt>H5FBf|y%*g{57+T{Obf z_J)lho2a2oq;yXaR(wk@H*w~G2C#f_ZN=1^boQ%w9Pio9FY-i?k>&0C@cSK;S*$B@ z>`O6|AgCP&VVT*J3>4khyq|S-tV0_?S(}TH^aF0Cs`FVMK!e#|Ju(H-;jR7ae#h)z z6wfM)`S1S&P^Rw?Hbru;o7<%)XC%nbOLHhwSFC7)h?gq5cyfWF@~+ze<(wJs-d(Y| z$B&C$IHQNXeoanJHAW^dgdvD{rwn_)a^n3APV*Oy->A%3O%U>W^8e1_j88k>kLrJK zUXJH#k%%TV+4*Wf2uP8@F9?_;vHD(c08vZedGt1}>-0nmpg;dd{L>=7U{kCAty^N`-;Il^^r?)gX+UoK@+Cu2hrhk8g9T_lFj6K&&DbJpxzcPFNkl z#Ic%M&5H9=zCVw^LSA{Bk33RuiiGiDjLUJ@HWPP+ZheNI;zz6i@HJ1_*fgupN3<_E zD*))TtSt%Y4}uGcy`LEel^S;L{V03#<$d8kkP^5cPCuEJ#Dh8kC~*bVg9GYa==7;N zPal6GTEO0R*;8`ai+#V`!y3B}^eWyikKZ(K!e&g_utJozEkA?xc(ZWzub3U-7d|x`h(dRr@u!sU}?fI@Gc=Cf#uSG#rMJl zkC>P$Ml#~yK1*Cf2&>Q=jgDoW`y*%Sp&a=DE|2CAK<@fO?Rmy(TZy87x-pbCj9N40 zEX**?pG^PKRlhXCA%@&dL^TN&ebDR&L5AB5G-xE|7$%|~?J)G;I_?VYF!7*Qvyo48 
z7_R%EiLb{dMSGfl!4sNOdS5xyO{TH94HiSL%fE6Ien{v&EN8_VSZr2x2+&*cdVe`D zK5$!cXV=#f@oGvyv(C-)wevneSF_`)-Dm-pYt?_kFRRox&t4v*nKMgDd#EM;`>8bp z+g#AQ9)&B@!|kw$LRP$qgRHz<1VhjlV!c=5WHVAYENT&GKf-w7h0s)z%DCCIn+%~i z1MoU(D43g<)-~;*Ca~JpwKDYCM0moTt{c^j{di~+CDv>+1ek8C?P=i&8&v4f1iMDA zhTqM)Z%QsRvWULq{bu%FxHDpYWnpRBwp@%%j^){7P0MUgy18X#R+E;7zmk2K3OL`d zkSw=V0A6!WPV86nP>cXc0t)j8AY0Ni5%e0Lx}vvKvIp`YAd5|jEO((qM!#E7RGK7& zQqne^>>WQ{iR2DL5|6OUJ>4y$c1pgl?#bZrszZgD7A3+0;VHn~(i1GQsodYMV#6-W1|mzKWv)x^hTkb}#O z#@gRZyykX7n0B&Ky5crHxtHV-=O{AdlO|)47{kkT&y%hqIA&#Ycqrm3$|W&kp=G8WkXr?x|`$TPEbhyjczZ^@H=O?oiEN{XanT+}HOl zB?b4=6+dAliHVAmGB(L)8`yIM8uRe+d6ND~eMbg1IUSeizsu~)tBwYH%6a>gL!=h8 zy~C!?MmJvj|L5wnUKv&zp|Qn$tbdmOtQq)e?~%A_<=J%cg4VqF+tkfo`eJ@86RGTH z%yfe6@6`4q&mieRhZtHdykDhlgJJ&^(DW@-|V+hH-GHjVlIoW&d`d65eI zqsQzssp-NT)<^hvI$iTDEr^~LhXmi=6Ncv2VU#Tomo^zNFztz0B@g8^~W_XRyb%-kA)l%qQqPC<&N>-=+plRpwbt7ihC-qHf>FCQ~T$b$xcA( zVGq2rf{*pkIv?l#s|>V_ zu2{q+)ca$;P|2ZIZg9Rddfy?T4ds4~{)XwG`ut8V_yO}hF$%XE_k7G^;&=5AX4HBQuX++tXcI~~4ToF~CWx3w zAML+CddIQ#LwUhimpIg>kjf~^N=-jrrswiI41A5Z;u#Sio1q)_*1>S2-Lgyn6sqOr zX?ZA&cv51PNqjNHhaw%$Yno61fc7K&hf-r9v$olrSg2E$|KfLMRf`lnsqhVE+iiDsBXv zhT}TLQFiDwB8@A$-RMV~GHsV?fbVU$EspKCr(@uuN3{JyC7&gTiGl(*L3U^CPlnM^ z`fD;Vb}~_pUt-RFemf48ytQxuZ%ows_KHBEQ4L)Yb038}*^c*gR-wY8Iw7~^V{?G$ z&ttvHm&eS3R3zRbiF(4BRguQTM46A?tF1`f1XJql>o_V{Lmqw%oAfY0Ya?BkCkAuO z3&{%TdR;;V$|oG_BlJZCXcv&B;6Ar^OE<_lrS(9eELMf zgyG9u;nCXAT*SIPJ{9;bBo_~@j8Z%MV=zqs;icrIKjlDE5g`UI^cdwDyG%IDlOZdgvrN*Wrg{A3UWTJmZ` zN)<-s{vI4@&ct-hG$-&HpOUDXWa|ZH9j+}tbVg*?7FbThq_%~{rpJ7~>Iv$aw&PCT zuyk^-=*kn2^ql{dF9b`qWWu&7EyyE8OPEJk+%cA>UWFeq>|BT#Q ziQ{KoQ&*?1rpC#|#bsjv;9{UNJ3c!4^HxFvcj;U{P$Re&osi*O+x6X$^KOC%j@h?P z#!TPR=?!b&9d?ArpKVcq*RT36%%jd1JG)#~BibZBwXrlcs-G*jYkUY^ zGF?LC&fg+lMvqP9L9mdPQreSGwU}%gE!h!jgyuKOD9Xb~CIkn{oyx3iYJ_ZX+9h>8JXSw&FE7R&l(;_c;)KxtK25N%xH?jMoe#3iHzBzk~H zjsS_>oUNAdCDH{U0%JI|iGQLWKIWa;p(jq=iE&L-ZCw=ayU$o!xMK%{@1V*uEoqKQ zpd~O=P{uuVRM=)Lvra}o4W;?8A^(VV7Sisni>uahb3U;J|8mI&D_R>?voX{-GPy9C 
zvNyy;cksnNq}51Uc@g<^%-!ytOSizveo>Z>LymjTYwCNgGlMk?T%Se8DNPmCRl%Va zi%lI3)zM0U-)psA+1=r=AyWq2%jvnRgS_%Fdqog~tT#>hN+otgSwg%No}OaH89dM++J6ls$ zQHaRxQ|r(?)0n9sE8QAaJNk=4%XwrVP1QPrk2OxIpG}Cir6N$Sd68RhznRW9XN*ks zRD)eqPjC}YKXgux&GlznmCqpAS=fq}*X>LGSvZfA4dII>T-ZfI}0>C{2| zXSM=`_Z$&V2jAVv7TUg4ha{BzR@`0^hk_rnbKR-o9E*0NZO+;-@R6^JbhOrPCYVVl zCt+%fvnh%#I0{QLF%?lGqcy?=>nIfxuTI-nT}dhO{Z0E^y0WUHY?LB>L*^3|9nVq= zKe4mz&w4aI4WE{S1u1Fgk2(eeM}m)W9nA^7wcc&?jMFBUNgfXs>WuTp?(~fh0RiSY zdjy{#tip}qi6d&iBaPd-qh`Exyh>P(cn&3I_*#`lMc5H~!eU4^X>L)&U4uNG38~+y zh-2>;YUEvl)`ZAM3$}A@|JvjlCMUMtRG&2N8W|cn zf+S+)$2I=hmA5C>t@{WfG2_?;wJyyVpk36t6e6CKTIW|*7m>Y5v4jRp)O5YH!S2Xl zP&j`}pAiU(|3av|CS24v+lPGN=$GO5A*?j4w+d$BYkbAr!#C*BfQn{{dv?MbFL`KU zjGYKe$qS|rphT-QEu+byp{XPtavQg|FXpbQ_p3}Z{AL_bochDRh{Y{AIXJ&CjtCC1 z_jcNL&vZ>lTt>>IoZN~U*%O-Le0zP6VH`g9!z+0XLx%gExs-lFb!~BVEzXIcTdYYl z;Ry!;uCq3SCe%YZELC=8X-Q(6&`)JX)2?-YqstLE$)i775-Yv_O3ccPZhEvO6cyGb zS{H3Qvv!0m4sYEbh8s?X|528~{XYHO^n@KmzurK8hIy-s?}j^qGmQSCEi)k$P|U-_ z6EjX@+$t?AJErgZa+CG;ybsRhuKxa;3VvWy{3o>zZ}9P((ERG^a~ju7oU$#m@A>HX zc+A9$gWzCIa7YOJ$B%!h9>Sf$pO~1NhqhgJ_RW>4CtxNh*UngO(5JD+vpf?xNeHVc ztjS4F>4-@x_R;^L&E-V;(aSKh!q~02F|)`y&Qg~LF&}Lut5!E7$f2Rlse?RV{MRJ+ zdr`%{A%Pt$0aieV#*dZc_uBr6hb{Oei{>B#KKrk30czuAx7Gx|kOpZb!_BOMuCSE3 zfrZ;LX6`wx$`I+?T-Qt=?R1m_LyRk z3stj|-Pxm-Q!Z)4LziLl^Bty~S6Sx8c%N&`rOnTdq+sQ6o-GLmF^zP$ zW={v~Q*{@AiSNJP)L60#ECA|MY0w(N43o^o%XAx^%}r>W(1FnH(!3Hdh%o9PGbges zqN!liMAet2Sd2a8Neat_ivCr2tZ3`UPkQjhfFFj}lproiqYzS;zgbsM#HQKC5Mt4`C15cah*qoN5E;D0QK_F;XW9H@Q5(^qL-Vu@)2Td+jIy+qYwzIYs399p0| zR7$q)E5o0z@ej-Sh#K_q!f@1$hRT=hx_Yc@X!K#D8Wwf~HToJQWnFYLsE&i0e z$^xIua=Qx*w5to~TRA>|ATeuTvx$V?KZ(jsBfxRTYnKcu4VtItMoZ56*Q(#w$uYvKYpxah9gblE_=AXcs<{m@Kk9FE5i3`q6@XDfau+ zp>p>6n37U}`)vLd)Hw7sgr%P-H+6VZ_BD)eoUE4T=y^T`RUa)bHz}YRiiKmgU#0cC ze>tqvKqU^9Dt#@FS3EKD96n^wA>bTu7K77zAL;K+-jTbYBrx~6Hl=mRWky&U*PCa{ zhW~mj%m|larUpN{Zdmw(hp5mG$C{DGhC;yinVAnD z*|495Ji&@uHjVRTU+B#c0r3^+&pK}pL{h9&zyrV`K>FjGL4f^HQh$FRyc}U!=4022 
z78cVpf;7Ziw$4NhhtdTT(o60FDq4oNQ>URh?x%=_e&#Bj@n(tJe4P3Lwh7;{^`Wco zB?Vlf+wQ9M3SlQE3!cNDR+0?B*P-@W!<&^@2iFJY_@|XTgO@7g9`;&tvI|e0SN^N9 z$x$;-aY<>6JZ%8UA2hXXjm797FG_?#qc+y7p*SkA;UXX)h@h$V@0VEhz26K0;RiZw z^<#11OCA>X!Hb0JUneYgVz!@wq3YCr+X!-eadVT>kdo>~Z>W7>gnI_Zhu$vsC=@np zOqV2Inf}+Ge#Y$Nu2zM`jGOE|Tc6N3eqQVqG`S#(zjC^Z6HZdVD zCl?hFVbl@lb@GBQO*qokWw21Go2?dJXTbDMy1u@?d>$5f&~tG$0nhY3?Zh|TO1<`B z`@dab16x0bqV@hH6+X(A=_`hM*GenO_doKe@wDgu8a7{nE($QCdV5!=Nvf*MP{T0P zK@}kHlvH)hSVi_%W6yMNC+;T+Tf&S zhVn(A_mzD}>3j_DHUDsR`Q{*w=ahbZYJK|Wf4}?jmWC@IQT{-Sj)l`{2mZP{9mZ8t zbVn4yL$n{#m0LM5X96}h@bKZHAxQof{<_^0ou>Tr;@g|OA-^?T5(J8AHVg#VljRGo z^YvDwCPqfWy_8U)MG~w33u3qOC{JWZ>UCY$9xQQItoJi4zE3##)NyZc2h4m}P|Ou< zX5-DRt(R9bsAF^NeZj3Z-WV?C}9P#=yv`2a1Y9Ck6(9X6S9 zGZ~S{j$YR+5IBflN{OLBYxcBqs|~h?EMeF>m7i0Uf&T&o)!q4qevbC_`7p3t4=q2K zzC$tYNboa0Y{6OYa8CQbuU8p``7`P!V@LWNlMLb+JKrK-OeSaC>BFG)g`aVWww1H1 zYux|;l4lV_2i<+_Z1R#7ns*(!zOJtek$#-h89sX;)uE# z@}UP;!*HNc_7^-=n)=!5jn$MU*}gp7vY*iSX!;(=TGC>}JS6(-o-sW;6p zQgUzoDz~M>+?LIBfBmhgi4%8B1{dHJapOo&5Pd@kGiY29B{4xfl1Aq}^#sfEcB2x( zF`P#IlQwMoDH9c1GX?zO$>3uY7|4|6`DW`cDOKz^Vg=YpLhTxR=Z-K9e`)_o_7 zH=;3L4LPi1ySyu2mFYRvM{2bGEvPFDr*J>b_Ru>uKI7t!O4%7YlHI$7WZ! 
zO8Qmm;Eq#M8b5mRTn8hNYS?P^lo=zoMZjlf$(ZNdKRmU2KP7rG{ug)uCC_pn`cl`0 zQzoxUJ3n4N$##u%+`Tf?!z$dfBGk<)RNQRLvm*DrG1u4dpO5JX1<0?cMFqpQrX;+# z{F`~QgmS)aG;nwq4QslAv;KoMW>);-SrlHa>5xZpWjU)M8c)On&m~_(4~-|#G4pw@ zX!d?odKX?HDurYXXyg5?srf#mwCU@>5Kmqn=-c---2TF@#*+FY%#52N{MG(MPLq1jWF-rLZ&^%b4x5GCC*(!0``F(+<)d8#;Ar= zpK{3Im4*1)JeMfWTCt`1=&B!nxNU9d9~_j0>zt|+E9hL(_2+iq@^Ew3exp#6S1kKCIbY|GpR-za8mh<%3eg zY|SH4pQJPazg*oj+29d3F*R9r6ZOilhI)R%XwZSQ_OpwhL*u+?vahXUN?klRtf4{K z(QuDXc~$=DrG*%jNgDK1kNU2&?aF3u2d6wYvaez3Mt%B8mHpT?>ji7h`lv-FuNC%=1{nD8@kt8?fk>0F#u+>+(y_eeWJCGw z$!TCnR<^D5E3lzHm=z#vdyZjQ45%nQ$eK3GCNRP$hBOBHJ>(PDv@@)~QR$KUXk-cp znqE!E$zbI3@QuTaU+2rXP)ytw#Hc@|%b2e5;&=59P3!I6Hd2}7(uE!180$sGh*;`| z<^7-@T%B$Wp5e2V_#;VB&TV=JL&+}8qrA;XU)}UvTv6HD6`r;^(ny$1P^4KxbE$^0 z2!5h8Q3|;c2x2;wSABEwIqPYVevcLWZn!5=u~7=!oC1m?A#RLgrUI0j95Xg^GZh@+ zDgN~$mWQN11yu?!@28bG#3sn6DAahSzI;oU?AMsdYGi+miC}M&H}1DHez5DERzcuW zLE=(F;!>SbW}8;xo7Uo*Ruh_DXZC2dzjm5ASlX*`WH8>>8XRwm)a4fNIUMcVtwT?m zkk#RNVoaKlVRZlw7zMXO;lCP(;D5~vId(W7dBF>eDesfWDw~l;ei>))$T3Tm>_vzf zXfh*3v<;iMVJkW|TR#3mDVNsD2Vy~elLuY@D#U@7oV_S!3J>7|Mt!reFcTe(c z8y%DBn4hfO!O@|JQs_~h7?e3moIj1b=hk`@o~<*en*UsBZ-$JKi?+>)RFK5!O2L1^ zl{I0Ur$hr=yvwu#Er_fnwU1d67Vv>GV-W6m6a31j9VS;vr5|=9WSmvnF2w`#Ep??L{yO5U8$|Lj>FjXL$*sFO7qp5teO8&SvhVz7=^2yd=U;GN*zw4 zU1r5?dF+9!CB94~m;f_7{zys%~`Ou2**n;<{bmh2s z$)aT0vUJg$I*?zxm^QVHI+>O(nR*DHW*C?H&*8|((a^|IM|4zMIj4ZuwfXRnIZBLL zQi57sf>MINWT~Vmd3*tRJUQuB1oc-*YAH!tX(<|M>9Rk6iid|whlWarM#|`@&8cZE zX=yBI=qyS~4~k0;OGRgI*x9eyIj#X3ubxCJcv(xF7ljaAmrz+imrXzyAmSXoy&a*v z;YGdSp`fHiMd?0?;-i1UN8j$Hq--ptbTLJl`D6;LDzKvFIV(euf3GZiEH1e(tvoEQ zJS_3O|J^L!?D@sho#OtH9QM8l=AjVgF(1aQ=)tq_v9;*2xyVCwRjHk%vLurc@H$I~ zm3CqhXf&maTXW@Z6HWs16c$YNGujI@#G(QdjTn>GT9=1NnS^ zO~-vp59DxrxCj8i{wR;g&l+u*Qp9%i5O8`H+dA=la$iZa%w>AK7RPzOT7kIfAFIxx zvX9WWZG1AN#6{m@k<2)it>E|RTM@(6`6$SQH=1@9e->5D#S~{1PN<>TnE9ukjL0A# zo8y-MU;o&}+3%Py(c{z=1*<_BHcxvSzAL@00(}yB9g8*FpZ|U_Bg-3$W`yC>7&)@| zkhrsmlA%rf`;P0gryGriXqfHQo~+R8eO!12LAh_;Skt)(f^pD;&+wm_ANyQAo@Zk1 zr&6TLJn=;6GH3q|8XF= 
z_OiBTE4TJ%?ZDnZVIn1*m$|j2wI4g@EY~cPtgb+Q6kNt~s}?KAFFc3Rq=H}`)$o#F z6}F(7^tQPtVN8xMViC(pFnDkk7I~HwxS>{=yN{@$Lb=IGRs1WHhDUv+WoAIr%Abb8~C!>dUqEv7@Y~taDnr!gN9SX%WsQwzx0X>*{C>3_G04(K>+OZLhNNGH>AAbns-BGaURIyJaI?$}Bymvl>qW(iOX8|+v-XE;K7-da-y3;4T4n5hv8z3Xv(iV)I3^7d0% z{$dtel7pjieIl+Q6&2Zv#-#F3aZqboMKQ%A>pY%x#)I2pZJg{T$xdPF zhl|<`-c#5Fz+nHr)F!22SH;s#wm3MWdZwP%4%lGd8=pihv}nB)&U;5l6|&7H0q9tEv>U2j*O6a!(!u&D+SM4(9j9|P3Bh5Q+QoP*m_0^ zftfasvcMEJSMOcTdGq>F6LXAn!H|TE`CH+J55==7<>0jB>Y9jsEtPXC9|f!Mag%H( zFJYp|XyCHOQ9k{kbHg003WDH9CyLYC=f6>eY0h&n=kYZbs0o(Cc02N45%s&T<)Yl^yp& zy#jg}lN#-R>q}$8QV$)B={H_@xzMx2F~MU{n(l_<_p)^I*d-CLFcs|Gl{~ zVB0ruj5NLeGzIK{@+Q!+23S+&c#f?;G^)2WCjKs_w#%xzfw=Q{VHQ_T$P#RsjYWhl zvGr79SGm`*-i2qYFd;{VaW5s@SroXWuW49y9*ko6TC0;y-z4|sU&gci$IloP?^QJQ zwpC70bJfgfO-->ZbLhD4c)p7=(*r8GoB7HzzZ#SO__)HtfTCl<_DA4Wdev2Y0zY&G zo;w;7#js+mv13WM)y6s@;r=_1GqD??sGv1y-_$cX$}eWu-s0vz3K0qdNycwcp>qCG z;mBCVcmLgPX4aP-Xw%^|MFUlkLQq{rc}*qrVsuX9MOU82*K$;3Y;Dhi<3MQ<67lIR zST;17N_Nli@RQEjgE^hOp27D0solSoe5hZb#Ag7RsHino3aXTnHx88ZxSLf=veX$2 zBWD$F$HZ@2f_)#ea*oWndR$Hy9nwv2PCTdy3iN$mAKsofh`&JJa%MV_?L`>yO3*YJ3OY_@;jiXMzCW`F zb#r~@Q{JsC=_$(XQr7;q@kFu=gx|Hwm=}%~R=HLl91ss${fw#eVI-;A;rxTB|^IdT!J4!oNg_LLZ@Oy=| zVPlCRS&bP~f!2wgJ=smCh00iXiMj0huU6T0NEI8fK4K^FIq@mBYE_s4L=1ZXMZ=J zdnpBIB*er-ybjuMQNz&A0u^mE61|*J zb(QRvf`C8XS$HiFitC#R=W==Z{oCM!@D^%+O~k`%{ie`Xn8ke91FE?Jx#@c&k#$S? 
zs*q{hXT{dDFK*n_eh?`KVSezzkHz}scLB(Nn}Q{7)VQaVIIxv1a>*Lf`2A(ATPxVD z(3XB%JdN1Pg_qv5bvRORFN1eQ{;-u0(uHF>Me^DQ5jD;L^@&w7$h*aRxvFCNX6dH- z0rHU@F;K;>n`)uQfJ5WJ;|o&!h>AYO4o#2#Rdg9~vdHyoP*LEqMi<}pH3c zV-Hg8-)sN0bVm?HUSHsk9h;DYa|#`6w(Y4-pGWgjNLtD1XN`fZPu(FFwy?ztf(L)N z52n7kOyo7MVz3$M1-o<(st54dqB;uuP5P0yds4=9%&E9u^3T?C3;X$Y(%an-M617bV7~--|G*S8u^R z{VlbrtVP8{kpG0@J&F0EPTNcv6LS|>gJUSUzGV&f0JIWdr|T zj?6S>sr{BHcd4EgCvuc_wnM_y@CSn7F_NjdTf)!#C9LaZ`K}92%C=1jl$E%1ktuO>VkHRPfo} z<}x|COPeO{7m~QWolcgEQbYdlB0%Gy_Mo%*5jR`U>Yw3K`{`LrS3-luk;!zCfOVir zBNND_57PRm_a=s;920HnMBGLcJV5N_b6;JE%mzqU9quWkZC+qP1x`tnAAPMnSpCN% zdxKg$fwOrS;ioG;wq_yCbrCZ#VlV9VDOXSBhSGy*PMnCA{1&gQ~{) zr{I(S8{Qdv@V@a@P|tkTN)NnG$TxzTZTz}GBF}Moxhn!zbKpbJj`uXI_=v2i*Lw8~ zkLNcgV%OE1*#l<}7c63p2!<3N;p0+%{A7{;*a^v&;#|0{taL5x8Tr(sf@R+)*cezg z0^wXs7nW#{PGt3EH1H8sm6oX*9d+@p(6c235#^&syv>+b1#Wk%a*b)Es;*T`3m+DjGn%fRq%Py(Ai z=2AIYUrvv%(CSd>Q?_fveht1PC06=@4E>Qxk54ko$*D#Pdp_h$RyZ;gt9{Dmq82eRy&$#_mxe<8n&VieT z(3v&l$SN(b{LA=le6#3Z#8@}gUdVJ9i%hRm4Nmqz@PT9~!D9U(7iXf4!*qz#?EQ=p zf<*Q+>1RPAU8EA;@4l4--i@we-iF`&cxU8W=C_pf3|!B>{%-ZWhV-qNt}ji)*&LZ% zmPJtq|5t#ySIi~-lgo6a6DF;q0kvz+uP5vPoHU){yJp9E-vcQf*&NX&#ow?=0faW+ zxOp^uo^I5~1L;9UCC=LuSs-^Fh342BYDA;Jq{9B@F%;z|=}ZZ#Bz_23!ii+B6tBv8 zsN1{#F}ug^90Z{-t12gNhe*NCNQn{C>uq0C^0;DIc3%!@xeWc=`$ocXuG(ghpH639 zCo{cI_04u_HkN#*KL(ld1Dr9n~{kO789q~49-_kRDrKi~YE zb7s#u`|PvVTF-jcv&Nco0?TW$@=aQG7`NbxFgzuSc6Xo6YBRn|lE z;;%4E>!{2Itw_iewcue~GOa$jj6A0W4x6^t+`Yc9hw|E^SAet@pFpqP`Vll`n@KI+d#0l-B-pg3yog0_M62OqM9U|)uS?|hwyBJY_ir}oY9 z06tG(SXuvN-%dAel=`6mgp3{9(lo!TqB!k7|n$W%h(6I&?XGC}_|>@rzXr(9(BLsdKB*Kgo0YVew|lT!g%X6p>{aLC!Q$Y6|E!jg@Eg9Voq zB(6d*<2kI!K;DsA??@KE;npKon!m#`Qi1FJST&CDA*zZV_{_{dzKBUbN?%pg(wL*B z89!JxxJlGFg%tHxu?AW(n_oNoa+!3-pIzG<97p;_AQNNj6Ut*7X2MO~cn(ZeUD>U< zcCgcB&25@EU;kBKLJgu-c4tY|3Jml7m)Ff_8@qcGQ*Fc#g*}ZqmY+t|f~<~4!cF`RH{qBJelG{S~UI=g8-GQ1t# z50BYCvn?aiZZ2=6>)M1f(r^ejnvy#q%C)NeAALhn-37n~SNCMUYAdMQzqvRQ!g2>$ zk5(|TE!H*s7FjjA(A!?NjJ7_TI4xImaCMqLG2*lQG);}{~gu6W+Mo;L|v91snP2Ya*ePAolTLF-|r!f*p*l4C2F(PVk 
zw~^Bcw1kD6?hcLerLI1cqO4qP6M0R%?ruA~hRG4#7WJFA`<$K^ABOBGyJc)-o#%-q zuJ#u9=Q0zV{SEKz{#W8bdAUr8!SzVr=) z=h7*9rBP08*xB;x4&{&q|D1Scro-2R&gd%B(q0VdueL^4-?JDXZCX^YcJKdn9#aCV zWVlcTYe)CU-w-S#qzw|)I-HyPYkn>ca=gimn2I3FK!k;5gaS#_!ARNFGE6qunI6{C zi;uuTGhEDw?u_`QXShWxMI*$-qiGNhJDW-8-u@Pq*XMYLVp}uaFCmAk+1!SutGhM7 z;0_C?tJ2GiT8x{#bM946!U$k0cp)ZtPPU=Br4OE&`M(ZRKCLGfPNQ9w zAuA6_qv%$3;phc8Ao(y2_2ihrl0`0$ttt2YsU-A#VIf<)FTgscW`?Vg<+PPe0N$f2 z6;?vMS1ON{8cW?rYjv%Z1#^zcfs|dhGE-^gJ0{-@K@~&4`jJY^3whHbcs$+Pu+JU6 z;Dg~8M+tNFl&|I( zPS#52H2H|x_2xm%bf_f05)@MK69l{ufHk}wwhF2fg{i#_F`oRu(@4f1-J-XMJ6vmt zyN_7A5>q0c;NipWVa)sC2+G#+O*VQOtbO4H!fNu8stKM)*ZAF>H};8H7=fK*;I6~* zS<6zL<6&VRd*u_1zfZhf67;KqufJ-TnRISM*T`H(7onZ_L zehG-R(#%!EgsQ%+$n0?AyX}MBS|{r|M0%Kq4MEGGrm;A1CMe!zxU(0O45=*&y}BLW zg9o?M^@F%Ly89%P>kOM&nuJzLiT1CD*3~CO1|(j$`gZCo{pM(WEZ9*-t#k6OqD}Fj zytevmP;!k)`?a<-65N(GDo3_M&wZr`=HZd+c&0ZNE#D@B{9^j2y%ppwe4a>@5&(BW zD2neEkYIkK@1>~Ue^R($`#OimJ{D0kMqye8@2$ZqH2s>hfr^((yV+7wj_W?EVBzR8 zSvCFZ((OO5E=pmJaPrKDcq;0QVMMEPfX+xFbA^5uYSyA}7qQi)3y49vUQWjPhFQsIhKmCvk^E0)-R}@QDIr&&xS# zm@&=t*`Uoz5}Juaz|fVZ*K|UzMrS}eicV#v22r)?n;4BY(5uqnE$ZQ?Kn!!>%3(;PI0VbrG*cx`Y&jo5 zm^;ETS(on<*}M)|&R3bZ7dH|e?wZoCC)k4yp)D#@+8JXutY)!NgYV7kx!OwJXzRy) zNGOpnj3L+xyp0w^39ekmv(t>2o!M(L@qXO>iJ9&cLPZ=qmA4)=|46UA>oS5r^gRlB zwSAQ{dVhk!U6mR+>v!Zzf!UAV3E**8^R@VIFgm(?0Nj#h-yEC@#Ht>pQ6z5`=1xo;U<5?fQ(LzRxtAs z2oRV6%N}eyUhdPUYg=G|2W%ad(a#Vt>yd3V5 z19S;iCYdg&*Bj1|a9UI|W*eXGHwt0@zy#-)hG=Mh{JzcdW7;G)BVb|`ay^L!OstEM zX8F!paD6dil9DdRItD(rjyBG0Xm6r~cM7MhSr^!aPbJ7tUESFsfx(io?98~oF;c*t zjWYG!Hn*&Y3D01PMB6YzZfv&$uBywnIXc2qe?7p$F@w<2L zE{Ex$l^)&PM1aGEg@u`!nJ6IG73dS3mBj+I+`yT?pWmYNV#IBdtU)WF{O3%xeUFWu zo%QZrP_5DN=CAXK-F64eARs0tCMY-%!;;wu)WwdE9~?RmonyzR8{mr0BWyrjO1N~F z@m_jgEK)q#QKpL}A{$#lcmXzFH(USzIBoe1H;6t(NY%?Q`3m?vc?^~REN2w>AGR5< A1poj5 literal 0 HcmV?d00001 diff --git a/docs/source/user_guide/feature_guide/index.md b/docs/source/user_guide/feature_guide/index.md index c24faac..00f702a 100644 --- a/docs/source/user_guide/feature_guide/index.md +++ 
b/docs/source/user_guide/feature_guide/index.md @@ -10,4 +10,5 @@ quantization sleep_mode structured_output lora +eplb_swift_balancer ::: diff --git a/tests/ut/distributed/test_determin_expert_map_all.py b/tests/ut/distributed/test_determin_expert_map_all.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 6a61cdd..4fba304 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -43,8 +43,19 @@ class AscendConfig: "ascend_scheduler_config", {}) self.ascend_scheduler_config = AscendSchedulerConfig( ascend_scheduler_config) - + # Todo: Once https://github.com/vllm-project/vllm/issues/22246 is merged in vllm. Remove this config self.expert_map_path = additional_config.get("expert_map_path", None) + self.expert_map_record_path = additional_config.get( + "expert_map_record_path", + None) # Provide path to export expert map + self.init_redundancy_expert = additional_config.get( + "init_redundancy_expert", 0) + self.dynamic_eplb = additional_config.get("dynamic_eplb", False) + self.num_iterations_eplb_update = additional_config.get( + "num_iterations_eplb_update", 400) + self.gate_eplb = additional_config.get("gate_eplb", False) + self.num_wait_worker_iterations = additional_config.get( + "num_wait_worker_iterations", 30) self.chunked_prefill_for_mla = additional_config.get( "chunked_prefill_for_mla", False) self.enable_shared_expert_dp = additional_config.get( diff --git a/vllm_ascend/eplb/__init__.py b/vllm_ascend/eplb/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/eplb/adaptor/__init__.py b/vllm_ascend/eplb/adaptor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/eplb/adaptor/abstract_adaptor.py b/vllm_ascend/eplb/adaptor/abstract_adaptor.py new file mode 100644 index 0000000..ab37fde --- /dev/null +++ b/vllm_ascend/eplb/adaptor/abstract_adaptor.py @@ -0,0 +1,44 @@ +# +# Copyright (c) 2025 Huawei Technologies 
Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +# Todo: Once https://github.com/vllm-project/vllm/issues/22246 is merged in vllm. Remove this adaptor. +from abc import abstractmethod +from typing import Any + + +class EplbAdaptor(): + + def __init__(self, **args): + pass + + @abstractmethod + def get_rank_expert_workload(self): + raise NotImplementedError + + @abstractmethod + def get_init_expert_map(self, num_moe_layers: Any) -> Any: + raise NotImplementedError + + @abstractmethod + def do_update_expert_map(self, layer_id: Any, + updated_expert_map: Any) -> Any: + raise NotImplementedError + + @abstractmethod + def do_update_expert_weight(self, layer_id: Any, + local_expert_to_replace: Any, + buffer_tensor_id: Any) -> Any: + raise NotImplementedError diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py new file mode 100644 index 0000000..d5ac509 --- /dev/null +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -0,0 +1,289 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +# Todo: Once https://github.com/vllm-project/vllm/issues/22246 is merged in vllm. Remove this adaptor. +import json +from typing import Any + +import torch +import torch.distributed as dist +from vllm.logger import logger + +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.eplb.adaptor.abstract_adaptor import EplbAdaptor + + +class VllmEplbAdaptor(EplbAdaptor): + + def __init__(self, model, **args): + super().__init__(**args) + self.model = model + self.rank_id = dist.get_rank() + self.world_size = dist.get_world_size() + self.param_dict = dict(self.model.named_parameters()) + if self.model.config.model_type == "qwen3_moe": + self.num_dense_layers = 0 + self.global_expert_num = self.model.config.num_experts + else: + self.num_dense_layers = self.model.config.first_k_dense_replace + self.global_expert_num = self.model.config.n_routed_experts + self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers + self.init_redundancy_expert = get_ascend_config( + ).init_redundancy_expert + + # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 and qwen3-moe is supported here + if self.model.quant_config is not None: + self.expert_weight_names = [ + "w13_weight", "w2_weight", "w13_weight_scale", + "w13_weight_offset", "w2_weight_scale", "w2_weight_offset" + ] + else: + self.expert_weight_names = ["w13_weight", "w2_weight"] + + self.expert_map_per_layer = dict( + ) # reference to expert map on device for expert map update + 
self.expert_map_per_layer_cpu = dict( + ) # copy of expert map on CPU to avoid device synchronize frequently + for layer_idx in range(self.num_moe_layers): + self.expert_map_per_layer[self.num_dense_layers + layer_idx] = \ + self.model.get_expert_map(self.num_dense_layers + layer_idx) + + # TODO: here we set number of buffer tensor equal to number of expert in each laryer, which can be improved + num_buffer_tensor = torch.where( + self.expert_map_per_layer[self.num_dense_layers] != -1)[0].numel() + self.buffer_tensor_list: list[list[Any]] = [ + [] for _ in range(num_buffer_tensor) + ] + self.init_buffer_tensor(num_buffer_tensor) + + self.expert_param_per_layer = dict() + self.init_expert_param_per_layer() + + self.log2phy_map_per_layer = dict() + for layer_idx in range(self.num_moe_layers): + self.log2phy_map_per_layer[self.num_dense_layers + layer_idx] = \ + self.model.get_log2phy_map(self.num_dense_layers + layer_idx) + + self.all_topk_ids = [] + + def init_buffer_tensor(self, num_buffer_tensor): + for name in self.expert_weight_names: + complete_name = "model.layers." + str( + self.num_dense_layers) + ".mlp.experts." + name + expert_tensor = self.param_dict[complete_name].data[ + 0:num_buffer_tensor] + buffer_tensors = torch.empty_like(expert_tensor) + for buffer_id in range(num_buffer_tensor): + self.buffer_tensor_list[buffer_id].append( + buffer_tensors[buffer_id]) + + def init_expert_param_per_layer(self): + num_local_expert = self.param_dict["model.layers." + str(self.num_dense_layers) + \ + ".mlp.experts." + self.expert_weight_names[0]].data.shape[0] + for moe_layer_id in range(self.num_moe_layers): + layer_idx = self.num_dense_layers + moe_layer_id + self.expert_param_per_layer[layer_idx] = list() + for local_expert_id in range(num_local_expert): + self.expert_param_per_layer[layer_idx].append([ + self.param_dict["model.layers." + str(layer_idx) + + ".mlp.experts." 
+ + name].data[local_expert_id] + for name in self.expert_weight_names + ]) + + def get_rank_expert_workload(self) -> torch.Tensor: + self.moe_load = self.model.get_all_moe_loads() + return self.moe_load + + def get_init_expert_map(self, num_moe_layers): + expert_map = self.model.get_all_expert_map(num_moe_layers) + if dist.is_initialized(): + world_size = dist.get_world_size() + + gathered = torch.empty( + (world_size, *expert_map.shape), # [W, L, E] + dtype=expert_map.dtype, + device=expert_map.device) + + dist.all_gather_into_tensor(gathered, expert_map) + all_maps = gathered.permute(1, 0, 2) + all_expert_maps = all_maps.cpu() + + for layer_idx in range(num_moe_layers): + self.expert_map_per_layer_cpu[self.num_dense_layers + layer_idx] = \ + all_expert_maps[layer_idx][self.rank_id] + + return all_expert_maps + + def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): + + try: + expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor( + expert_map_path) + expert_map_all = self.local2global(expert_map_tensor) + except (TypeError, FileNotFoundError, OSError): + expert_map_all = self.determine_expert_map_all() + + for layer_idx in range(num_moe_layers): + if self.model.config.model_type == "qwen3_moe": + self.expert_map_per_layer_cpu[layer_idx] = \ + expert_map_all[layer_idx][self.rank_id] + else: + self.expert_map_per_layer_cpu[layer_idx + self.num_dense_layers] = \ + expert_map_all[layer_idx][self.rank_id] + return expert_map_all + + def _expert_file_to_tensor(self, expert_map_path: str): + with open(expert_map_path, "r") as f: + data = json.load(f) + layers_num = data["moe_layer_count"] + gpus_num = data["layer_list"][0]["device_count"] + + tensor_data = [] + for layer in data["layer_list"]: + device_data = [] + for device in layer["device_list"]: + device_data.append(device["device_expert"]) + tensor_data.append(device_data) + expert_map_tensor = torch.tensor(tensor_data, dtype=torch.int32) + return expert_map_tensor, layers_num, 
gpus_num + logger.error(f"failed to read expert_map_path: {expert_map_path}") + + def _export_tensor_to_file(self, expert_maps, expert_map_record_path: str): + if self.rank_id == 0: + num_local_experts = expert_maps.max() + 1 + expert_maps_local = self.global2local(expert_maps, + num_local_experts) + + expert_maps_list = expert_maps_local.tolist() + record: dict[str, Any] = { + "moe_layer_count": len(expert_maps_list), + "layer_list": [] + } + + for layer_idx, layer_data in enumerate(expert_maps_list): + layer_record: dict[str, Any] = { + "layer_id": layer_idx, + "device_count": len(layer_data), + "device_list": [] + } + + for device_idx, experts in enumerate(layer_data): + device_record = { + "device_id": device_idx, + "device_expert": experts + } + layer_record["device_list"].append(device_record) + + record["layer_list"].append(layer_record) + + with open(expert_map_record_path, "w") as f: + json.dump(record, f, indent=4) + + def do_update_expert_map(self, layer_id, updated_expert_map): + self.expert_map_per_layer[layer_id] = updated_expert_map.clone() + self.expert_map_per_layer_cpu[layer_id] = updated_expert_map.clone() + + def do_update_expert_weight(self, layer_id, local_expert_to_replace, + buffer_tensor_id): + for expert_tensor, buffer_tensor in zip( + self.expert_param_per_layer[layer_id][local_expert_to_replace], + self.buffer_tensor_list[buffer_tensor_id]): + expert_tensor = buffer_tensor.clone() + logger.debug(f"Expert tensor shape is :{expert_tensor.shape}") + + def do_update_log2phy_map(self, layer_id, updated_log2phy_map): + if self.log2phy_map_per_layer[layer_id] is not None: + self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map) + + def global2local(self, placement: torch.Tensor, + E_local: int) -> torch.Tensor: + + L, G, _ = placement.shape + device = placement.device + + pt_local = torch.full((L, G, E_local), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement >= 0 + l_idx, g_idx, k_idx = 
valid.nonzero(as_tuple=True) + + slot_idx = placement[l_idx, g_idx, k_idx] + + pt_local[l_idx, g_idx, slot_idx] = k_idx + + return pt_local + + def local2global(self, placement_local: torch.Tensor) -> torch.Tensor: + + L, G, E_local = placement_local.shape + device = placement_local.device + + max_id = torch.max(placement_local) + E_global = (max_id + 1).item() if max_id >= 0 else 0 + + if E_global == 0: + return torch.empty((L, G, 0), dtype=torch.long, device=device) + + placement_global = torch.full((L, G, E_global), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement_local >= 0 + l_idx, g_idx, slot_idx = valid.nonzero(as_tuple=True) + gid_idx = placement_local[l_idx, g_idx, slot_idx] + + placement_global[l_idx, g_idx, gid_idx] = slot_idx + + return placement_global + + def determine_expert_map_all(self): + if self.world_size == 1: + local_ids = torch.arange(self.global_expert_num, dtype=torch.int32) + return local_ids.view(1, 1, -1).expand(self.num_moe_layers, 1, -1) + + local_num_experts = self.global_expert_num // self.world_size + + expert_map_all = torch.full( + (self.num_moe_layers, self.world_size, self.global_expert_num), + -1, + dtype=torch.int32) + + for r in range(self.world_size): + if r < self.world_size - 1: + start = r * local_num_experts + end = (r + 1) * local_num_experts + local_count = local_num_experts + else: + start = r * local_num_experts + end = self.global_expert_num + local_count = self.global_expert_num - r * local_num_experts + + if r < self.init_redundancy_expert: + local_count += 1 + if end < self.global_expert_num: + end += 1 + else: + start -= 1 + + local_ids = torch.arange(local_count, dtype=torch.int32) + expert_map_all[:, r, start:end] = local_ids.unsqueeze(0).expand( + self.num_moe_layers, -1) + + return expert_map_all diff --git a/vllm_ascend/eplb/core/__init__.py b/vllm_ascend/eplb/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
class ExpertWeightUpdateState(Enum):
    """Stages of one layer's device-to-device (d2d) expert weight update."""
    WAITING = 0  # idle: a new transfer task may be accepted
    READY = 1  # send/recv ops are prepared and can be launched
    TRANSFERRING = 2  # send/recv ops launched; maps and weights not yet committed


class D2DExpertWeightLoader:
    """
    Drives a three-step update of one layer's expert weights between ranks.

    1. ``generate_expert_d2d_transfer_task`` turns send/recv descriptions into
       ``dist.P2POp`` objects (WAITING -> READY).
    2. ``asyn_expert_weight_transfer`` launches them with
       ``dist.batch_isend_irecv`` (READY -> TRANSFERRING).
    3. ``update_expert_map_and_weight`` waits for completion and commits the
       new maps and weights through the adaptor (TRANSFERRING -> WAITING).

    Each step is a no-op when the loader is not in the expected state.
    """

    def __init__(self):
        # Set explicitly so that use before set_adator() fails on a ``None``
        # attribute access instead of a missing attribute.
        self.eplb_adaptor = None
        self.comm_op_list = None
        self.updated_expert_map = None
        self.updated_log2phy_map = None
        self.layer_id = -1  # layer id to be updated
        self.state = ExpertWeightUpdateState.WAITING
        self.recv_expert_list = []
        self.mock_flag = True

    def set_adator(self, eplb_adaptor):
        """Attach the adaptor that owns expert maps, weights and buffers."""
        self.eplb_adaptor = eplb_adaptor

    def set_adaptor(self, eplb_adaptor):
        """Correctly spelled alias of :meth:`set_adator`."""
        self.set_adator(eplb_adaptor)

    def generate_expert_d2d_transfer_task(self, expert_send_info,
                                          expert_recv_info, updated_expert_map,
                                          layer_id):
        """
        Prepare the P2P ops needed to update ``layer_id`` on this rank.

        Args:
            expert_send_info: iterable of ``(dst_rank, global_expert_id)``
                pairs this rank must send.
            expert_recv_info: iterable of ``(src_rank, global_expert_id)``
                pairs this rank must receive; the i-th entry is received into
                ``eplb_adaptor.buffer_tensor_list[i]``.
            updated_expert_map: this rank's new expert map; it is indexed by
                global expert id to find the local slot to overwrite.
            layer_id: layer being updated.

        Nothing is scheduled when a previous task is still in flight (an error
        is logged) or when both info lists are empty.
        """
        # While the current send/recv and map update are unfinished, a new
        # d2d task cannot be accepted.
        if self.state != ExpertWeightUpdateState.WAITING:
            logger.error(
                "current d2d weight update tasks are on-going, cannot accept new weight update task"
            )
            return

        # Neither send nor receive is needed for this layer on this rank.
        if not (expert_send_info or expert_recv_info):
            return

        self.updated_expert_map = updated_expert_map
        self.layer_id = layer_id
        self.comm_op_list = []

        adaptor = self.eplb_adaptor
        for dst_rank, global_expert_id_to_send in expert_send_info:
            local_expert_id = adaptor.expert_map_per_layer_cpu[layer_id][
                global_expert_id_to_send].item()
            for src_tensor in adaptor.expert_param_per_layer[layer_id][
                    local_expert_id]:
                self.comm_op_list.append(
                    dist.P2POp(dist.isend, src_tensor, dst_rank))

        for buffer_tensor_id, (recv_rank, global_expert_id_to_recv) in \
                enumerate(expert_recv_info):
            for buffer_tensor in adaptor.buffer_tensor_list[buffer_tensor_id]:
                self.comm_op_list.append(
                    dist.P2POp(dist.irecv, buffer_tensor, recv_rank))
            local_expert_to_replace = self.updated_expert_map[
                global_expert_id_to_recv].item()
            # Remember which buffer feeds which local slot for the commit step.
            self.recv_expert_list.append(
                (local_expert_to_replace, buffer_tensor_id))

        self.state = ExpertWeightUpdateState.READY

    def set_log2phy_map(self, log2phy_map):
        """Store the log2phy map to commit together with the expert map."""
        self.updated_log2phy_map = log2phy_map

    def asyn_expert_weight_transfer(self, reqs):
        """
        Launch the prepared send/recv ops and append their requests to ``reqs``.

        Does nothing unless the loader is READY.
        """
        # Send/recv can only be launched after they were parsed into
        # self.comm_op_list.
        if self.state != ExpertWeightUpdateState.READY:
            return

        # Launch every queued op as one batch; completion is awaited later.
        if self.comm_op_list:
            ret_list = dist.batch_isend_irecv(self.comm_op_list)
            reqs.extend(ret_list)

        self.state = ExpertWeightUpdateState.TRANSFERRING

    def update_expert_map_and_weight(self, reqs):
        """
        Wait for every request in ``reqs`` and commit the update via the adaptor.

        The expert map, the log2phy map and then each received expert's
        weights are pushed through the adaptor, after which the loader returns
        to WAITING. Does nothing unless the loader is TRANSFERRING.
        """
        # Maps and weights can only be updated after send/recv were launched.
        if self.state != ExpertWeightUpdateState.TRANSFERRING:
            return

        # Wait for the send/recv tasks to finish.
        for req in reqs:
            req.wait()

        self.comm_op_list = None

        # update expert_map
        self.eplb_adaptor.do_update_expert_map(self.layer_id,
                                               self.updated_expert_map)

        # update log2phy_map
        self.eplb_adaptor.do_update_log2phy_map(self.layer_id,
                                                self.updated_log2phy_map)

        # update expert weight
        for local_expert_to_replace, buffer_tensor_id in self.recv_expert_list:
            self.eplb_adaptor.do_update_expert_weight(self.layer_id,
                                                      local_expert_to_replace,
                                                      buffer_tensor_id)

        logger.info(
            f"[EPLB] finished update expert weight for layer: {self.layer_id}")

        self.recv_expert_list = []
        self.updated_expert_map = None
        self.layer_id = -1
        self.state = ExpertWeightUpdateState.WAITING

    def load_impl(self, old_expert_table, new_expert_table):
        """Not implemented for this loader."""
        raise NotImplementedError
def _default_rank_layout(rank, world_size, global_expert_num,
                         global_redundant_expert_num):
    """Return ``(start, end, local_count)`` of the default expert range of ``rank``."""
    local_num_experts = global_expert_num // world_size
    start = rank * local_num_experts
    # The last rank absorbs whatever the integer division left over.
    end = start + local_num_experts if rank < world_size - 1 \
        else global_expert_num
    local_count = end - start

    # Ranks below the redundancy count host one extra expert: extend the
    # range to the right, or to the left when it already ends at the last
    # expert. A non-int redundancy value (e.g. None) disables this.
    if isinstance(global_redundant_expert_num,
                  int) and rank < global_redundant_expert_num:
        local_count += 1
        if end < global_expert_num:
            end += 1
        else:
            start -= 1

    return start, end, local_count


def determine_default_expert_map(global_expert_num, world_size, rank_id,
                                 global_redundant_expert_num):
    """
    Build the default expert map of a single rank.

    Returns:
        ``(local_count, expert_map)`` where ``expert_map`` is an int32 tensor
        of length ``global_expert_num`` holding the local slot of each expert
        on ``rank_id`` or ``-1`` when the rank does not hold it. With a single
        rank every expert is local and ``local_count`` is ``global_expert_num``.
    """
    if world_size == 1:
        local_ids = torch.arange(global_expert_num, dtype=torch.int32)
        return (global_expert_num, local_ids)

    start, end, local_count = _default_rank_layout(
        rank_id, world_size, global_expert_num, global_redundant_expert_num)

    expert_map = torch.full((global_expert_num, ), -1, dtype=torch.int32)
    expert_map[start:end] = torch.arange(local_count, dtype=torch.int32)

    return (local_count, expert_map)


def generate_log2phy_map(expert_map):
    """
    Turn a ``(num_ranks, num_global_expert)`` expert map into a log2phy map.

    A held entry becomes ``local_slot + rank * num_local_experts`` where
    ``num_local_experts`` is ``expert_map.max() + 1``. For ranks that do not
    hold an expert the entry is filled as follows:

    * no rank holds it: ``0`` for every rank;
    * exactly one rank holds it: that rank's value;
    * several ranks hold it: a value drawn with ``random.choice`` from the
      holders, independently per rank.

    The input tensor is not modified.
    """
    num_local_experts = expert_map.max() + 1
    log2phy_map = expert_map.clone()
    num_ranks, num_global_expert = log2phy_map.shape

    row_indices = torch.arange(num_ranks).view(-1, 1).expand(
        num_ranks, num_global_expert) * num_local_experts
    held = log2phy_map != -1
    log2phy_map[held] += row_indices[held]

    for idx in range(num_global_expert):
        positive_rank_idx = torch.where(log2phy_map[:, idx] != -1)[0]
        negative_rank_idx = torch.where(log2phy_map[:, idx] == -1)[0]
        num_rank_holding_expert = positive_rank_idx.size(0)

        # The three cases are mutually exclusive. Without the elif chain an
        # unheld expert would also reach the random branch, call
        # random.choice on an empty sequence and log a spurious error.
        if num_rank_holding_expert == 0:
            log2phy_map[:, idx] = 0
        elif num_rank_holding_expert == 1:
            log2phy_map[negative_rank_idx,
                        idx] = log2phy_map[positive_rank_idx, idx].item()
        else:
            try:
                candidates = log2phy_map[positive_rank_idx, idx].tolist()
                random_list = [
                    random.choice(candidates)
                    for _ in range(num_ranks - num_rank_holding_expert)
                ]
                log2phy_map[negative_rank_idx,
                            idx] = torch.tensor(random_list,
                                                dtype=log2phy_map.dtype)
            except Exception as e:
                logger.error(f"Fail to get log2phy_map: {str(e)}")

    return log2phy_map


def determine_default_log2phy_map(global_expert_num, world_size, rank_id,
                                  global_redundant_expert_num):
    """
    Return the row for ``rank_id`` of the default log2phy map.

    The default expert map of every rank is assembled first, using the same
    layout as ``determine_default_expert_map``, and then converted with
    ``generate_log2phy_map``.
    """
    if world_size == 1:
        local_ids = torch.arange(global_expert_num, dtype=torch.int32)
        expert_map_all = local_ids.unsqueeze(0).expand(world_size, -1)
        log2phy_map_all = generate_log2phy_map(expert_map_all)
        return log2phy_map_all[rank_id]

    expert_map_all = torch.full((world_size, global_expert_num),
                                -1,
                                dtype=torch.int32)

    for r in range(world_size):
        # Redundancy must be decided per row ``r``. Testing ``rank_id`` here
        # would give every row the caller's redundancy status, so different
        # ranks would build different global maps.
        start, end, local_count = _default_rank_layout(
            r, world_size, global_expert_num, global_redundant_expert_num)
        expert_map_all[r, start:end] = torch.arange(local_count,
                                                    dtype=torch.int32)

    log2phy_map_all = generate_log2phy_map(expert_map_all)

    return log2phy_map_all[rank_id]
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +from multiprocessing import Process, Queue +from typing import Any + +import networkx as nx # type: ignore +import numpy as np +import torch +import torch.distributed as dist +from vllm.logger import logger + +from vllm_ascend.eplb.core.eplb_utils import generate_log2phy_map +from vllm_ascend.eplb.core.policy.policy_factory import (DynamicConfig, + PolicyFactory) + + +class EplbWorker: + + def __init__(self, shared_dict, policy_type, enable_d2d: bool = True): + self.policy_type = policy_type + self.policy = PolicyFactory.generate_policy(policy_type, + DynamicConfig()) + self.shared_dict = shared_dict + self.old_expert_maps = None + self.enable_d2d = enable_d2d + self.rank_id = dist.get_rank() + + def do_update(self): + # put data in to queue + # in process self.policy.generate_policy() + # get epxert table && tensor + + # async stream + # D2D + # H2D + # Get initial expert_map + torch.set_num_threads(1) + if self.old_expert_maps is None: + self.old_expert_maps = self.get_init_expert_maps() + if self.old_expert_maps is not None: + self.num_local_experts = self.old_expert_maps.max() + 1 + else: + raise ValueError("Failed to get expert_maps from shared_dict.") + + # Get MOE load information + load_info = self.fetch_and_sum_load_info() + if load_info is None: + return + + # Get the updated expert table based on the workload information + old_placement = self.global2local(self.old_expert_maps, + self.num_local_experts) + _, _, new_placement = self.calculate_rebalance_experts( + load_info, old_placement) + + 
def check_expert_placement(self, old_placement, new_placement):
    """
    Validate ``new_placement`` layer by layer and revert invalid layers in place.

    A layer is rejected, and overwritten with the matching layer of
    ``old_placement``, when the first of these checks fails:

    * it contains fewer distinct values than the old layer;
    * some rank lists the same value more than once;
    * a value kept by a rank sits in a different slot than before.

    Rank checks stop at the first offending rank. Each rejection is logged
    as an error. Nothing is returned.
    """
    num_layers, num_ranks = old_placement.shape[0], old_placement.shape[1]

    for layer_id in range(num_layers):
        old_layer = old_placement[layer_id]
        new_layer = new_placement[layer_id]
        violation = None

        # check if any logical expert is not placed on any rank
        if torch.unique(new_layer).numel() < torch.unique(old_layer).numel():
            violation = f"There exists expert not placed on any rank in layer {layer_id}"
        else:
            for rank_id in range(num_ranks):
                new_placement_check = new_layer[rank_id]
                old_placement_check = old_layer[rank_id]

                # check if same logical experts are placed on the same NPU
                distinct = torch.unique(new_placement_check).numel()
                if new_placement_check.numel() != distinct:
                    violation = f"Replicated experts are placed on the same NPU, expert placement on layer {layer_id}, rank {rank_id} is invalid"
                    break

                # check if there is any experts movement inside one NPU
                expert_not_move = torch.isin(new_placement_check,
                                             old_placement_check)
                kept_new = new_placement_check[expert_not_move]
                kept_old = old_placement_check[expert_not_move]
                if not torch.equal(kept_new, kept_old):
                    violation = f"There exists expert movement inside NPU, expert placement on layer {layer_id}, rank {rank_id} is invalid"
                    break

        if violation is not None:
            logger.error(violation)
            new_placement[layer_id] = old_placement[layer_id]
updated_expert_maps_org, + current_expert_maps_org): + # transform numpy array to torch tensor + updated_expert_maps = updated_expert_maps_org.clone() + current_expert_maps = current_expert_maps_org.clone() + updated_expert_maps = np.array(updated_expert_maps) + current_expert_maps = np.array(current_expert_maps) + + num_layers = current_expert_maps.shape[0] + + for layer_id in range(num_layers): + updated_expert_maps_this_layer = updated_expert_maps[layer_id] + current_expert_maps_this_layer = current_expert_maps[layer_id] + updated_expert_maps_this_layer_org = updated_expert_maps_org[ + layer_id] + + from typing import Any + + expert_send_info_this_layer: dict[Any, Any] = {} + expert_recv_info_this_layer: dict[Any, Any] = {} + + # Guard Clause: if there is no expert weight update, avoid subsequent processing + if (np.equal(updated_expert_maps_this_layer, + current_expert_maps_this_layer)).all(): + yield (expert_send_info_this_layer, + expert_recv_info_this_layer, + updated_expert_maps_this_layer_org, layer_id) + + # Parse expert_ids each rank needs to receive from other ranks + dst_rank_indices, experts_to_recv = np.where( + (current_expert_maps_this_layer == -1) + & (updated_expert_maps_this_layer != -1)) + + # record src ranks for potential transfer + src_ranks_set = dict() + for idx in range(len(dst_rank_indices)): + expert_id = experts_to_recv[idx].item() + if expert_id not in src_ranks_set: + src_ranks_set[expert_id] = np.where( + current_expert_maps_this_layer[:, expert_id] != -1)[0] + + # loop until all experts are scheduled + while len(dst_rank_indices) > 0: + # construct bipartite graph + graph_expert_update: nx.Graph = nx.Graph() + for idx in range(len(dst_rank_indices)): + dst_rank_id = dst_rank_indices[idx].item() + expert_id = experts_to_recv[idx].item() + # add src ranks + src_rank_ids = src_ranks_set[expert_id] + graph_expert_update.add_nodes_from(src_rank_ids, + bipartite=0) + # add dest rank + graph_expert_update.add_node(str(dst_rank_id), 
# TODO: Here only expert weight exchange is considered, need to be extended to cover other weight update cases
def compose_expert_update_info_greedy(self, updated_expert_maps,
                                      current_expert_maps):
    """
    Yield, for every layer, which rank sends which expert to which rank.

    Both arguments are indexed as ``[layer, rank, expert]`` with ``-1``
    meaning "rank does not hold expert".

    Yields:
        Exactly one tuple per layer:
        ``(expert_send_info, expert_recv_info, updated_layer_map, layer_id)``.
        ``expert_send_info`` maps a source rank to a list of
        ``(dst_rank, expert_id)`` and ``expert_recv_info`` maps a destination
        rank to a list of ``(src_rank, expert_id)``. Both are empty for a
        layer whose map did not change.

    For each expert a rank newly receives, the source is the first rank that
    is giving that expert up; when no rank gives it up, the first rank
    currently holding it is used.
    """
    num_layers = current_expert_maps.shape[0]
    for layer_id in range(num_layers):
        updated_expert_maps_this_layer = updated_expert_maps[layer_id]
        current_expert_maps_this_layer = current_expert_maps[layer_id]

        expert_send_info_this_layer = {}
        expert_recv_info_this_layer = {}

        # Guard clause: nothing changed in this layer. The ``continue`` is
        # required, otherwise control falls through and the same empty
        # result is yielded a second time for the layer.
        if torch.equal(updated_expert_maps_this_layer,
                       current_expert_maps_this_layer):
            yield (expert_send_info_this_layer, expert_recv_info_this_layer,
                   updated_expert_maps_this_layer, layer_id)
            continue

        # Parse expert_ids each rank needs to receive from other ranks
        dst_rank_indices, experts_to_recv = torch.where(
            (current_expert_maps_this_layer == -1)
            & (updated_expert_maps_this_layer != -1))

        # Parse expert_ids each rank needs to send to other ranks
        src_rank_indices, experts_to_send = torch.where(
            (current_expert_maps_this_layer != -1)
            & (updated_expert_maps_this_layer == -1))

        for dst_rank, expert in zip(dst_rank_indices.tolist(),
                                    experts_to_recv.tolist()):
            recv_list = expert_recv_info_this_layer.setdefault(dst_rank, [])

            candidate_src_rank_indices = src_rank_indices[experts_to_send ==
                                                          expert]
            if candidate_src_rank_indices.numel() == 0:
                # The expert is not leaving any rank, so it is copied from a
                # rank that currently holds it.
                candidate_src_rank_indices = torch.where(
                    current_expert_maps_this_layer[:, expert] != -1)[0]

            # TODO: improve selection criterion of npu sending expert_id considering such as intra-node or inter-node...
            src_rank = candidate_src_rank_indices[0].item()

            expert_send_info_this_layer.setdefault(src_rank, []).append(
                (dst_rank, expert))
            recv_list.append((src_rank, expert))

        yield (expert_send_info_this_layer, expert_recv_info_this_layer,
               updated_expert_maps_this_layer, layer_id)
def global2local(self, placement: torch.Tensor,
                 E_local: int) -> torch.Tensor:
    """
    Convert an expert map into a per-rank slot table.

    ``placement[l, g, k]`` holds the local slot of global expert ``k`` on rank
    ``g`` in layer ``l`` (negative when absent). The result has shape
    ``(L, G, E_local)`` and dtype ``long``; ``result[l, g, slot]`` is the
    global expert id stored in that slot, or ``-1`` for an unused slot.

    Args:
        placement: 3-D tensor indexed ``[layer, rank, global_expert]``.
        E_local: number of slots per rank; anything ``int()`` accepts,
            including a 0-dim tensor such as ``placement.max() + 1``.

    Returns:
        A single tensor (not a tuple).
    """
    L, G, _ = placement.shape
    device = placement.device

    pt_local = torch.full((L, G, int(E_local)),
                          fill_value=-1,
                          dtype=torch.long,
                          device=device)

    valid = placement >= 0
    l_idx, g_idx, k_idx = valid.nonzero(as_tuple=True)

    # Cast to long so an int32 placement can be used as an index tensor.
    slot_idx = placement[l_idx, g_idx, k_idx].long()

    pt_local[l_idx, g_idx, slot_idx] = k_idx

    return pt_local
def worker_process(self, planner_q, block_update_q):
    """
    Subprocess entry: loop waiting for ``planner_q`` to wake up, call
    ``self.worker.do_update()``, then hand the result to the main process.

    Each result is put on ``block_update_q`` only once that queue is empty,
    so at most one pending result exists at a time. Any exception, whether
    from the queues or from ``do_update``, is logged as a warning and ends
    the loop.
    """
    import time

    # Pause between emptiness checks so the wait for the consumer does not
    # spin a CPU core at 100%.
    poll_interval_s = 0.001

    while True:
        try:
            planner_q.get()

            packed_update_info = self.worker.do_update()

            while not block_update_q.empty():
                time.sleep(poll_interval_s)
            block_update_q.put(packed_update_info)

        except Exception as e:
            logger.warning(f"[EPLB subprocess Exiting due to error: {e}",
                           exc_info=True)
            break
from abc import ABC, abstractmethod


class DynamicConfig:
    """Tunable settings handed to an ``EplbPolicy``."""
    placement_policy = None

    max_transferred_expert_per_layer = 100  # Maximum number of experts that can be migrated per layer on a single host
    ep_worldsize = 64  # Total number of dies across the entire cluster where experts are distributed
    num_die_per_host = 8  # Number of dies on each host machine


class EplbPolicy(ABC):
    """
    Base class of expert load-balancing policies.

    Deriving from ``ABC`` makes ``@abstractmethod`` effective: a subclass
    that does not override ``rebalance_experts`` cannot be instantiated.
    """

    def __init__(self, config: DynamicConfig):
        self.config = config

    @abstractmethod
    def rebalance_experts(self, current_expert_table, expert_workload):
        """
        Pass in the weights and return expert replication and placement under relevant constraints.
        INPUT:
        current_expert_table: [layerId, rankId, expert_num_i]
        expert_workload = expert_table[layer0][rankId][expert_num_i]

        RETURNED: (res, priority, expert_table)
        NOTE(review): EplbWorker.calculate_rebalance_experts unpacks three
        values from this call (changed, priority, new_map); the meaning of
        the middle value is not visible from here - confirm against the
        concrete policies.
        res:
        1 -- table_changed
        0 -- not_changed

        expert_table: [layerId, rankId, expert_num_i]
        expert_num_i --- [0, MaxExpertPerRank]
        expertID = expert_table[layer0][rankId][expert_num_i]
        array_values:
        [0, 1, 2, 3, 248]
        [4, 5, 6, 7, 254]
        [8, 9, 10, 11, 71]
        ...
        [252, 253, 254, 255, 0]
        """
        pass
expert_workload[layer_idx].copy() + for npu_idx in range(npu_num): + for expert_idx in range(experts_per_npu): + workload_dict[placement_layer[npu_idx][ + expert_idx]] += workload_layer[npu_idx][expert_idx] + for expert_idx in range(num_original_expert): + workload_new[layer_idx][expert_idx] = workload_dict[expert_idx] + return workload_new + + @staticmethod + # Split hot (high-load) experts into redundant experts + def original_compute_balanced_pack_redundancy(origin_weights, card_num, + num_redundancy_expert): + # Step 1: Sort the items by weight in descending order (we are sorting by weight now) + # Sort based on the second element (the second value of each tuple) + route_expert_num = len(origin_weights) + route_expert_redundancy: list[list[int]] = [ + [] for _ in range(route_expert_num) + ] + for i in range(num_redundancy_expert): + sorted_indices = np.argsort([t[1] for t in origin_weights], + kind='stable')[::-1] + weights = [origin_weights[idx] for idx in sorted_indices] + tmp_raw_weight = weights[0][1] * ( + len(route_expert_redundancy[weights[0][0]]) + 1) + route_expert_redundancy[weights[0][0]].append(route_expert_num + i) + avg_weight = tmp_raw_weight / ( + len(route_expert_redundancy[weights[0][0]]) + 1) + weights[0] = (weights[0][0], avg_weight) + origin_weights = weights + + # Step 2: Calculate the number of items per box + expert_num = route_expert_num + num_redundancy_expert + items_per_box = expert_num // card_num # Number of items per box + remaining_items = expert_num % card_num # Number of items per box + + # Step 3: Initialize card_num boxes with empty lists to store item IDs + boxes: list[list[int]] = [[] for _ in range(card_num)] + boxes_weights: list[list[float]] = [[] for _ in range(card_num)] + box_weights = [0] * card_num # To store the total weight of each box + box_counts = [0] * card_num # To store the number of items in each box + index = 0 + for i in range(route_expert_num): + redundancy_num = len(route_expert_redundancy[i]) + for _ 
in range(redundancy_num): + cur_weight = 0 + for item, weight in origin_weights: + if item == i: + cur_weight = weight + + boxes[index].append(i) + boxes_weights[index].append(cur_weight) + box_weights[index] += cur_weight + box_counts[index] += 1 + index += 1 + + sorted_indices = np.argsort([t[1] for t in origin_weights], + kind='stable')[::-1] + origin_weights = [origin_weights[idx] for idx in sorted_indices] + # Step 4: Distribute items into boxes based on weight + for item_id, weight in origin_weights: + # Find the box with the least items but not full + min_box_index = -1 + for i in range(card_num): + if item_id in boxes[i]: + continue + # Only choose boxes that still have space (box_counts[i] < items_per_box) + if box_counts[i] < items_per_box or (box_counts[i] + == items_per_box + and remaining_items > 0): + if min_box_index == -1 or box_weights[i] < box_weights[ + min_box_index]: + min_box_index = i + + # Place the item (id) into the selected box + boxes[min_box_index].append(item_id) + boxes_weights[min_box_index].append(weight) + box_weights[min_box_index] += weight + box_counts[min_box_index] += 1 + + # If there's an imbalance in the remaining items, reduce the "remaining_items" counter + if box_counts[min_box_index] == (items_per_box + + 1) and remaining_items > 0: + remaining_items -= 1 + + # Step 5: Output each box's contents and total weight + result = [] + for i in range(card_num): + result.append({ + "box_index": i + 1, + "items": boxes[i], # List of item IDs in the box + "weight": boxes_weights[i], + "total_weight": box_weights[i], # Total weight in this box + "item_count": box_counts[i] # Number of items in the box + }) + + return result, boxes + + # Split hot (high-load) experts into redundant experts + @staticmethod + def compute_balanced_pack_redundancy(origin_weights, card_num, + num_redundancy_expert): + route_expert_num = len(origin_weights) + route_expert_redundancy: list[list[int]] = [ + [] for _ in range(route_expert_num) + ] + for i in 
range(num_redundancy_expert): + sorted_indices = np.argsort([t[1] for t in origin_weights], + kind='stable')[::-1] + weights = [origin_weights[idx] for idx in sorted_indices] + tmp_raw_weight = weights[0][1] * ( + len(route_expert_redundancy[weights[0][0]]) + 1) + route_expert_redundancy[weights[0][0]].append(route_expert_num + i) + avg_weight = tmp_raw_weight / ( + len(route_expert_redundancy[weights[0][0]]) + 1) + weights[0] = (weights[0][0], avg_weight) + origin_weights = weights + + expert_num = route_expert_num + num_redundancy_expert + if card_num == 0: + raise RuntimeError("card_num can not be 0.") + items_per_box = expert_num // card_num + remaining_items = expert_num % card_num + + boxes: list[list[int]] = [[] for _ in range(card_num)] + boxes_weights: list[list[float]] = [[] for _ in range(card_num)] + box_weights = [0] * card_num + box_counts = [0] * card_num + + all_weights = np.zeros((expert_num, ), dtype='object') + all_weights[:route_expert_num] = origin_weights + + index = route_expert_num + for i in range(route_expert_num): + redundancy_num = len(route_expert_redundancy[i]) + for _ in range(redundancy_num): + for item, weight in origin_weights: + if item == i: + all_weights[index] = (item, weight) + index += 1 + + sorted_indices = np.argsort([t[1] for t in all_weights], + kind='stable')[::-1] + all_weights = [all_weights[idx] for idx in sorted_indices] + for item_id, weight in all_weights: + min_box_index = -1 + for i in range(card_num): + if box_counts[i] < items_per_box or (box_counts[i] + == items_per_box + and remaining_items > 0): + if min_box_index == -1 or box_weights[i] < box_weights[ + min_box_index]: + if item_id not in boxes[i]: + min_box_index = i + + boxes[min_box_index].append(item_id) + boxes_weights[min_box_index].append(weight) + box_weights[min_box_index] += weight + box_counts[min_box_index] += 1 + + if box_counts[min_box_index] == (items_per_box + + 1) and remaining_items > 0: + remaining_items -= 1 + + result = [] + for i in 
range(card_num): + result.append({ + "box_index": i + 1, + "items": boxes[i], + "weight": boxes_weights[i], + "total_weight": box_weights[i], + "item_count": box_counts[i] + }) + + return result, boxes + + # Scheme without redundant experts + @staticmethod + def compute_balanced_pack(origin_weights, card_num): + sorted_indices = np.argsort([t[1] for t in origin_weights])[::-1] + weights = origin_weights[sorted_indices] + expert_num = len(weights) + if card_num == 0: + raise RuntimeError("card_num can not be 0.") + items_per_box = expert_num // card_num + remaining_items = expert_num % card_num + + boxes: list[list[int]] = [[] for _ in range(card_num)] + boxes_weights: list[list[float]] = [[] for _ in range(card_num)] + box_weights = [0] * card_num + box_counts = [0] * card_num + + for item_id, weight in weights: + min_box_index = -1 + for i in range(card_num): + if box_counts[i] < items_per_box or (box_counts[i] + == items_per_box + and remaining_items > 0): + if min_box_index == -1 or box_weights[i] < box_weights[ + min_box_index]: + min_box_index = i + + boxes[min_box_index].append(item_id) + boxes_weights[min_box_index].append(weight) + box_weights[min_box_index] += weight + box_counts[min_box_index] += 1 + + if box_counts[min_box_index] == (items_per_box + + 1) and remaining_items > 0: + remaining_items -= 1 + + result = [] + for i in range(card_num): + result.append({ + "box_index": i + 1, + "items": boxes[i], + "weight": boxes_weights[i], + "total_weight": box_weights[i], + "item_count": box_counts[i] + }) + + return result, boxes + + @staticmethod + def get_redundant_num(npu_num, counts): + redundant_num_each_npu: int = np.sum(counts - 1) + return redundant_num_each_npu + + @staticmethod + def calculate_max_heat_per_layer(workload_table, layer_num): + max_heat_per_layer: list[float] = [] + for layer_idx in range(layer_num): + npu_heats_now = np.sum(workload_table[layer_idx], axis=1) + max_heat_per_layer.append(np.max(npu_heats_now)) + return 
max_heat_per_layer + + @staticmethod + def constraint_expert_local_exchange(current_expert_table, + global_deployment): + for layer_id in range(len(global_deployment)): + for card_id in range(len(global_deployment[layer_id])): + current_list = [ + int(x) for x in current_expert_table[layer_id][card_id] + ] + new_list = [ + int(x) for x in global_deployment[layer_id][card_id] + ] + num = len(new_list) + + new_index = [-1] * num + new_result = [-1] * num + remaining_elements = [] + + for i in range(num): + flag = True + for j in range(num): + if new_list[i] == current_list[j] and new_index[ + j] == -1: + new_index[j] = 0 + new_result[j] = current_list[j] + flag = False + break + if flag: + remaining_elements.append(new_list[i]) + + index = 0 + for k in range(num): + if new_result[k] == -1: + new_result[k] = remaining_elements[index] + index += 1 + + global_deployment[layer_id][card_id] = new_result + + return global_deployment + + def rebalance_experts(self, current_expert_table, expert_workload): + + info = DynamicTable() + info.workload_table = np.array(expert_workload) + info.placement_table = np.array(current_expert_table) + assert info.workload_table is not None + layer_num, num_npus, experts_per_npu = info.workload_table.shape + assert info.placement_table is not None + row = cast(np.ndarray, info.placement_table[0]) + expert_ids, counts = np.unique(row, return_counts=True) + num_redundancy_expert = self.get_redundant_num(num_npus, counts) + num_original_expert = len(expert_ids) + layer_workloads = self.add_redundant(info.placement_table, + info.workload_table, + num_original_expert) + max_heat_per_layer_before = self.calculate_max_heat_per_layer( + info.workload_table, layer_num) + npu_heat_all_origin = sum(max_heat_per_layer_before) + + # Perform load balancing and deploy redundant experts + layer_num = layer_workloads.shape[0] + expert_num = layer_workloads.shape[1] + # Validate that the number of experts, number of cards, and number of redundant experts do 
not exceed the number of cards + if num_original_expert != expert_num: + raise ValueError( + f"the number of original experts {num_original_expert} must be equal to expert_num {expert_num}" + ) + + if num_npus <= 0: + raise ValueError("the number of NPUs must be greater than 0") + + if num_npus < num_redundancy_expert: + raise ValueError( + f"the number of NPUs {num_npus} must be greater than or equal to the number of redundant experts {num_redundancy_expert}" + ) + + # Number of experts deployed on each card includes one redundant expert + global_deployment: list[list[list[int]]] = [[[] + for _ in range(num_npus)] + for _ in range(layer_num)] + # Iterate to obtain the placement strategy for each layer, taking computational balance into account + max_heat_per_layer_after = np.zeros([layer_num]) + for layer in range(layer_num): + # Get the expert IDs and their corresponding workloads for the current layer; + # workloads need to be normalized, and one redundant expert is added per card + weights = np.zeros((expert_num, ), dtype='object') + for expert_id, workload_weight in enumerate( + layer_workloads[layer]): + weights[expert_id] = (expert_id, workload_weight) + + # Obtain the globally balanced placement strategy for each layer + result, layer_deployment = self.original_compute_balanced_pack_redundancy( + weights, num_npus, num_redundancy_expert) + + global_deployment[layer] = layer_deployment + max_heat_per_layer_after[layer] = max( + result, key=lambda x: x['total_weight'])['total_weight'] + + new_global_deployment = self.constraint_expert_local_exchange( + current_expert_table, global_deployment) + # Obtain the priority of each layer + layer_changed_ratio = [] + for layer_idx in range(layer_num): + layer_changed_ratio.append(max_heat_per_layer_after[layer_idx] / + max_heat_per_layer_before[layer_idx]) + + per_layer_priority = np.argsort(layer_changed_ratio) + npu_heat_all_after = sum(max_heat_per_layer_after) + + change = 0 + if npu_heat_all_after < 0.95 * 
npu_heat_all_origin: + change = 1 + + return change, per_layer_priority, np.array( + new_global_deployment).tolist() diff --git a/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py b/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py new file mode 100644 index 0000000..a0b8d5d --- /dev/null +++ b/vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py @@ -0,0 +1,771 @@ +# Copyright Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. +# Todo: Once https://github.com/vllm-project/vllm/pull/24069 is merged in vllm. Remove this policy. +from abc import abstractmethod +from collections import defaultdict + +import numpy as np + + +class DynamicConfig: + placement_policy = None + + max_transferred_expert_per_layer = 100 # Maximum number of experts that can be migrated per layer on a single host + ep_worldsize = 64 # Total number of dies across the entire cluster where experts are distributed + num_die_per_host = 8 # Number of dies on each host machine + + +class EplbPolicy: + + def __init__(self, config: DynamicConfig): + self.config = config + + @abstractmethod + def rebalance_experts(self, current_expert_table, expert_workload): + """ + Pass in the weights and return expert replication and placement under relevant constraints. + INPUT: + current_expert_table: [layerId, rankId, expert_num_i] + expert_workload = expert_table[layer0][rankId][expert_num_i] + + RETURNED: (res, expert_table) + res: + 1 -- table_changed + 0 -- not_changed + + expert_table: [layerId, rankId, expert_num_i] + expert_num_i --- [0, MaxExpertPerRank] + expertID = expert_table[layer0][rankId][expert_num_i] + array_values: + [0, 1, 2, 3, 248] + [4, 5, 6, 7, 254] + [8, 9, 10, 11, 71] + ... 
+ [252, 253, 254, 255, 0] + """ + pass + + +class DynamicTable: + # workload_table: + # 3D matrix: [layer, gpus, experts_per_gpu_per_layer] -> value: workload (heat) at the corresponding position + # Size: number of layers * number of GPUs * number of experts per GPU per layer + # The element at (i, j, k) represents the workload (heat) of the k-th expert on the j-th GPU in the i-th layer + # For experts that are not available or collected, the value is set to -1 + workload_table = None + + # placement_table: + # 3D matrix: [layer, gpus, experts_per_gpu_per_layer] -> value: physical expert ID at the corresponding position + # Size: number of layers * number of GPUs * number of experts per GPU per layer + # The element at (i, j, k) represents the physical expert ID of the k-th expert on the j-th GPU in the i-th layer + # For experts that are not available or collected, the value is set to -1 + placement_table = None + + +class DynamicEplbV2(EplbPolicy): + + def __init__(self, config: DynamicConfig): + super().__init__(config) + + @staticmethod + def safe_divide(a, b): + if b == 0: + print("Division by zero is not allowed") + return 0 + return a / b + + @staticmethod + def safe_exact_divide(a, b): + if b == 0: + print("Division by zero is not allowed") + return 0 + return a // b + + @staticmethod + def safe_mod(a, b): + if b == 0: + print("Division by zero is not allowed") + return 0 + return a % b + + @staticmethod + def add_redundant(current_expert_table, expert_workload, + num_original_expert): + layer_num, npu_num, experts_per_npu = expert_workload.shape + workload_new = np.zeros((layer_num, num_original_expert)) + for layer_idx in range(layer_num): + workload_dict: dict[int, int] = defaultdict(int) + placement_layer = current_expert_table[layer_idx].copy() + workload_layer = expert_workload[layer_idx].copy() + for npu_idx in range(npu_num): + for expert_idx in range(experts_per_npu): + workload_dict[placement_layer[npu_idx][ + expert_idx]] += 
workload_layer[npu_idx][expert_idx] + for expert_idx in range(num_original_expert): + workload_new[layer_idx][expert_idx] = workload_dict[expert_idx] + return workload_new + + @staticmethod + def get_redundant_num(npu_num, counts): + redundant_num_each_npu: int = int(np.sum(counts - 1)) + return redundant_num_each_npu + + @staticmethod + def calculate_max_heat_per_layer(workload_table, layer_num): + max_heat_per_layer: list[float] = [] + for layer_idx in range(layer_num): + npu_heats_now = np.sum(workload_table[layer_idx], axis=1) + max_heat_per_layer.append(np.max(npu_heats_now)) + return max_heat_per_layer + + def calculate_initial_imbalance(self, global_deployment, + new_layer_workloads): + + device_num = global_deployment.shape[1] + layer_imbalance = [] + expert_num = np.zeros_like(new_layer_workloads) + for layer_id, layer in enumerate(global_deployment): + for device in layer: + for expert_id in device: + expert_num[layer_id][expert_id] += 1 + + for layer_id, layer in enumerate(global_deployment): + cur_layer_max_workload = 0 + total_workload = 0 + for box in layer: + box_workload = 0 + for expert_id in box: + update_workload = self.safe_divide( + new_layer_workloads[layer_id][expert_id], + expert_num[layer_id][expert_id]) + box_workload += update_workload + total_workload += update_workload + if cur_layer_max_workload < box_workload: + cur_layer_max_workload = box_workload + + cur_layer_imbalance = self.safe_divide( + cur_layer_max_workload, + (self.safe_divide(total_workload, device_num))) + layer_imbalance.append(cur_layer_imbalance) + + return layer_imbalance + + def compute_redundant_assignments(self, base_experts, + num_redundant_experts, num_experts): + + redundant_assignments: list[list[int]] = [[] + for _ in range(num_experts)] + current_weights = base_experts.copy() + + for i in range(num_redundant_experts): + sorted_indices = np.argsort([w for _, w in current_weights], + kind='stable')[::-1] + sorted_weights = [current_weights[i] for i in 
def repeat_compute_redundant_assignments(self, layer_workloads, rendun_pos,
                                         num_experts, num_exist_expert,
                                         device_assignments, device_counts,
                                         expert_from_device,
                                         com_between_devices):
    """Fill the remaining free redundant slots, hottest expert first.

    On every pass the expert with the highest per-replica weight that is
    not yet present on some device with a free slot receives one more
    replica there, and its per-replica weight is rescaled from ``n`` to
    ``n + 1`` copies.

    Args:
        layer_workloads: Per-replica weight of each expert, indexed by
            expert id; ``-1`` marks an expert that has no replica.
        rendun_pos: Per device, the list of still-free slot indices
            (consumed in place with ``pop()``).
        num_experts: Kept for interface compatibility; the expert count is
            taken from ``layer_workloads``.
        num_exist_expert: Replica count per expert id (updated in place).
        device_assignments: Per device, the expert id held by each slot
            (updated in place).
        device_counts: Per device, number of filled slots (updated in
            place).
        expert_from_device: Per expert id, the key under which a transfer
            is recorded in ``com_between_devices``.
        com_between_devices: Per device, dict recording the expert that
            was brought in (updated in place).

    Returns:
        ``(weights, device_assignments, device_counts,
        com_between_devices)`` where ``weights`` is the list of updated
        per-replica weights ordered by expert id.
    """
    current_weights = [(expert_id, workload_weight)
                       for expert_id, workload_weight in enumerate(
                           layer_workloads)]

    devices_with_slots = [
        device_id for device_id, free_slots in enumerate(rendun_pos)
        if len(free_slots) != 0
    ]

    while devices_with_slots:
        sorted_indices = np.argsort([w for _, w in current_weights],
                                    kind='stable')[::-1]

        placed = False
        for weight_index in sorted_indices:
            expert_id, original_weight = current_weights[weight_index]
            if original_weight == -1:
                # Only experts without any replica are left to try.
                print("Error:Redundant expert failure re-occurred")
                break
            for cur_device_id in devices_with_slots:
                if expert_id in device_assignments[cur_device_id]:
                    continue
                pos = rendun_pos[cur_device_id].pop()
                if len(rendun_pos[cur_device_id]) == 0:
                    devices_with_slots = [
                        device_id for device_id in devices_with_slots
                        if device_id != cur_device_id
                    ]
                device_assignments[cur_device_id][pos] = expert_id
                device_counts[cur_device_id] += 1
                communication_box_index = expert_from_device[expert_id]
                com_between_devices[cur_device_id][
                    communication_box_index] = expert_id
                # The divisor is replica count + 1, so it is never zero.
                new_weight = (original_weight *
                              num_exist_expert[expert_id]) / (
                                  num_exist_expert[expert_id] + 1)
                # Write back into the list that is re-sorted on the next
                # pass; otherwise the same stale weight is seen again.
                current_weights[weight_index] = (expert_id, new_weight)
                num_exist_expert[expert_id] += 1
                placed = True
                break
            if placed:
                break

        if not placed:
            # Nothing changed in this pass, so another pass would be
            # identical: stop instead of spinning forever.
            break

    # current_weights is indexed by expert id, so this is already ordered.
    updated_weights = [weight for _, weight in current_weights]

    return updated_weights, device_assignments, device_counts, com_between_devices
def distribute_redun_experts(self, layer_workloads, device_assignments,
                             device_weights, device_loads, device_counts,
                             redundant_expert_list, expert_from_device,
                             num_experts, rendun_pos):
    """Place redundant expert replicas into the free slots of each device.

    First pass: every ``(expert_id, weight)`` in ``redundant_expert_list``
    goes to the least-loaded device that still has a free slot and does
    not already hold that expert. If free slots remain afterwards, the
    per-replica weights are recomputed and the remaining slots are filled
    by ``repeat_compute_redundant_assignments``; device weights and loads
    are then rebuilt from the updated per-replica weights.

    Args:
        layer_workloads: Total workload per expert id for this layer.
        device_assignments: Per device, expert id per slot (updated in
            place).
        device_weights: Per device, weight per slot (updated in place).
        device_loads: Per device, summed weight.
        device_counts: Per device, number of filled slots.
        redundant_expert_list: Iterable of ``(expert_id, weight)`` replicas
            to place, in placement order.
        expert_from_device: Per expert id, the key under which a transfer
            is recorded in the returned communication dicts.
        num_experts: Forwarded to ``repeat_compute_redundant_assignments``.
        rendun_pos: Per device, list of free slot indices (consumed in
            place).

    Returns:
        ``(device_assignments, device_weights, device_loads,
        device_counts, com_between_devices)``.
    """
    num_devices = len(device_assignments)
    com_between_devices: list[dict[int,
                                   int]] = [{} for _ in range(num_devices)]

    for expert_id, weight in redundant_expert_list:
        candidate = -1
        for dev_id in range(num_devices):
            if len(rendun_pos[dev_id]) == 0:
                continue
            if expert_id in device_assignments[dev_id]:
                continue
            if candidate == -1 or device_loads[dev_id] < device_loads[
                    candidate]:
                candidate = dev_id
        if candidate == -1:
            # No device can take this replica; the fallback below handles
            # the slot that stays free.
            continue

        pos = rendun_pos[candidate].pop()
        device_assignments[candidate][pos] = expert_id
        device_weights[candidate][pos] = weight
        device_loads[candidate] += weight
        device_counts[candidate] += 1

        communication_box_index = expert_from_device[expert_id]
        com_between_devices[candidate][communication_box_index] = expert_id

    if any(sublist for sublist in rendun_pos):
        cur_layer_workload, num_exist_expert = self.recomputing_initial_weight(
            layer_workloads, device_assignments)

        # Pass device_counts (not device_loads): the callee increments the
        # per-device slot counter and hands it back.
        update_workload, device_assignments, device_counts, com_between_devices = self.repeat_compute_redundant_assignments(
            cur_layer_workload, rendun_pos, num_experts, num_exist_expert,
            device_assignments, device_counts, expert_from_device,
            com_between_devices)

        device_loads = [0] * len(device_counts)
        for device_id, device in enumerate(device_assignments):
            for index, expert_id in enumerate(device):
                if expert_id == -1:
                    # Unfilled slot: must not alias the last expert's weight.
                    continue
                device_weights[device_id][index] = update_workload[
                    expert_id]
                device_loads[device_id] += update_workload[expert_id]

    return device_assignments, device_weights, device_loads, device_counts, com_between_devices
"assigned_experts": device_assignments[dev_id], + "expert_weights": device_weights[dev_id], + "total_load": current_load, + "expert_count": device_counts[dev_id] + }) + + return report, max_load + + @staticmethod + def exchange_expert(cur_exchange_index, next_exchange_index, cur_device_id, + next_device_id, cur_layer_result, com_between_devices): + + cur_device_deployment = cur_layer_result[cur_device_id][ + 'assigned_experts'] + next_device_deployment = cur_layer_result[next_device_id][ + 'assigned_experts'] + + cur_device_weight = cur_layer_result[cur_device_id]['expert_weights'] + next_device_weight = cur_layer_result[next_device_id]['expert_weights'] + + cur_expert_id = cur_device_deployment[cur_exchange_index] + next_expert_id = next_device_deployment[next_exchange_index] + cur_device_deployment[cur_exchange_index] = next_expert_id + next_device_deployment[next_exchange_index] = cur_expert_id + + cur_expert_weight = cur_device_weight[cur_exchange_index] + next_expert_weight = next_device_weight[next_exchange_index] + cur_device_weight[cur_exchange_index] = next_expert_weight + next_device_weight[next_exchange_index] = cur_expert_weight + + cur_layer_result[cur_device_id][ + 'total_load'] += next_expert_weight - cur_expert_weight + cur_layer_result[next_device_id][ + 'total_load'] += cur_expert_weight - next_expert_weight + + com_between_devices[cur_device_id][next_device_id] = next_expert_id + com_between_devices[next_device_id][cur_device_id] = cur_expert_id + + def redundant_expert_deployment(self, layer_workloads, original_deployment, + expert_from_device, node_num, + is_node_redundant, rendun_pos): + device_num, per_device_expert_num = original_deployment.shape + route_expert_num = layer_workloads.shape[0] + per_node_device_num = self.safe_exact_divide(device_num, node_num) + per_node_route_expert_num = per_node_device_num * ( + per_device_expert_num - 1) + + weights = np.zeros((route_expert_num, ), dtype='object') + for expert_id, workload_weight in 
enumerate(layer_workloads): + weights[expert_id] = (expert_id, workload_weight) + + if is_node_redundant: + + device_assignments = [] + device_weights = [] + device_loads = [] + device_counts = [] + com_between_devices = [] + + for node_id in range(node_num): + cur_node_weights = weights[node_id * + per_node_route_expert_num:(node_id + + 1) * + per_node_route_expert_num] + cur_original_deployment = original_deployment[ + node_id * per_node_device_num:(node_id + 1) * + per_node_device_num] + + cur_node_rendun_pos = rendun_pos[node_id * + per_node_device_num:(node_id + + 1) * + per_node_device_num] + + cur_device_assignments, cur_device_weights, cur_device_loads, cur_device_counts, cur_com_between_devices = self.redundancy_again( + layer_workloads, cur_node_weights, cur_original_deployment, + expert_from_device, node_num, is_node_redundant, + cur_node_rendun_pos) + device_assignments += cur_device_assignments + device_weights += cur_device_weights + device_loads += cur_device_loads + device_counts += cur_device_counts + com_between_devices += cur_com_between_devices + + else: + device_assignments, device_weights, device_loads, device_counts, com_between_devices = self.redundancy_again( + layer_workloads, weights, original_deployment, + expert_from_device, node_num, is_node_redundant, rendun_pos) + report, max_load = self.generate_allocation_report( + device_assignments, device_weights, device_loads, device_counts) + + return report, max_load, com_between_devices + + @staticmethod + def two_device_exchange_experts(cur_device_result, exchange_device_result, + cur_exchanged_expert_id, + next_exchanged_expert_id, ave_workload, + increment, num_redundancy_expert): + + cur_device_weight = cur_device_result['expert_weights'] + next_device_weight = exchange_device_result['expert_weights'] + + cur_device_expert_id = cur_device_result['assigned_experts'] + next_device_expert_id = exchange_device_result['assigned_experts'] + + cur_device_total_weight = 
cur_device_result['total_load'] + next_device_total_weight = exchange_device_result['total_load'] + max_weight = max(cur_device_total_weight, next_device_total_weight) + + cur_exchange_index = -1 + next_exchange_index = -1 + + for index, weight in enumerate(cur_device_weight): + for next_index, next_weight in enumerate(next_device_weight): + change_flag = True + if (cur_device_expert_id[index] in next_device_expert_id + or next_device_expert_id[next_index] + in cur_device_expert_id): + change_flag = False + if (cur_device_expert_id[index] not in cur_exchanged_expert_id + ) and (next_device_expert_id[next_index] + not in next_exchanged_expert_id) and change_flag: + + cur_total_weight_after_exchange = cur_device_total_weight - weight + next_weight + next_total_weight_after_exchange = next_device_total_weight - next_weight + weight + exchange_max_weight = max( + cur_total_weight_after_exchange, + next_total_weight_after_exchange) + if exchange_max_weight < max_weight and ( + max_weight - + exchange_max_weight) >= (ave_workload * increment): + max_weight = exchange_max_weight + cur_exchange_index = index + next_exchange_index = next_index + + return cur_exchange_index, next_exchange_index + + def expert_exchange_between_devices(self, + ave_workload, + increment, + cur_layer_result, + com_between_devices, + num_redundancy_expert, + node_idx=0, + per_node_device_num=0, + is_node_redundant=False): + + if is_node_redundant: + cur_devices_result = cur_layer_result[node_idx * + per_node_device_num: + (node_idx + 1) * + per_node_device_num] + else: + cur_devices_result = cur_layer_result + + devices_total_weight = [] + for device in cur_devices_result: + devices_total_weight.append( + (device['total_load'], device['device_id'] - 1)) + + exchange_frequency = 100 + while exchange_frequency > 0: + exchange_frequency -= 1 + devices_total_weight.sort(key=lambda x: x[0]) + max_weight_device_id = devices_total_weight[-1][1] + exchange = False + for index in range(0, 
len(devices_total_weight) - 1): + min_weight_device_id = devices_total_weight[index][1] + if min_weight_device_id not in com_between_devices[ + max_weight_device_id]: + cur_exchanged_expert_id = list( + com_between_devices[max_weight_device_id].values()) + next_exchanged_expert_id = list( + com_between_devices[min_weight_device_id].values()) + + cur_exchange_index, next_exchange_index = self.two_device_exchange_experts( + cur_layer_result[max_weight_device_id], + cur_layer_result[min_weight_device_id], + cur_exchanged_expert_id, next_exchanged_expert_id, + ave_workload, increment, num_redundancy_expert) + + if cur_exchange_index != -1: + self.exchange_expert(cur_exchange_index, + next_exchange_index, + max_weight_device_id, + min_weight_device_id, + cur_layer_result, + com_between_devices) + + devices_total_weight[-1] = ( + cur_layer_result[max_weight_device_id] + ['total_load'], max_weight_device_id) + devices_total_weight[index] = ( + cur_layer_result[min_weight_device_id] + ['total_load'], min_weight_device_id) + exchange = True + break + + if not exchange: + break + + def exchange_experts(self, layer_result, layer_com_between_devices, + num_nodes, device_num, is_node_redundant, + ave_workload, increment, num_redundancy_expert, + org_deployment): + + global_deployment = [] + + if is_node_redundant: + per_node_device_num = self.safe_exact_divide(device_num, num_nodes) + for node_idx in range(num_nodes): + self.expert_exchange_between_devices( + ave_workload, increment, layer_result, + layer_com_between_devices, num_redundancy_expert, node_idx, + per_node_device_num, is_node_redundant) + else: + self.expert_exchange_between_devices(ave_workload, increment, + layer_result, + layer_com_between_devices, + num_redundancy_expert) + + max_workload = 0 + for box in layer_result: + global_deployment.append(box['assigned_experts']) + if max_workload < box['total_load']: + max_workload = box['total_load'] + + global_deployment = np.array(global_deployment) + + return 
def rebalance_experts(self,
                      current_expert_table,
                      expert_workload,
                      is_node_redundant=False,
                      increment=0.01):
    """Compute a new expert placement from the recorded per-slot workload.

    Args:
        current_expert_table: Current placement; ``.numpy()`` is called on
            it, and it is indexed as ``[layer][npu][slot]``.
        expert_workload: Recorded workload with the same layout;
            ``.numpy()`` is called on it.
        is_node_redundant: Forwarded to the redundancy and exchange steps
            to restrict them to groups of devices.
        increment: Minimum improvement of a swap, as a fraction of the
            average per-NPU workload.

    Returns:
        ``(change, per_layer_priority, new_deployment)`` where ``change``
        is 1 when the summed per-layer peak load drops below 95% of the
        original and 0 otherwise, ``per_layer_priority`` is the argsort of
        the per-layer after/before peak ratio, and ``new_deployment`` is
        the placement as nested lists.

    Raises:
        ValueError: If the expert count is inconsistent, there is no NPU,
            or there are more redundant experts than NPUs.
    """
    info = DynamicTable()
    info.workload_table = expert_workload.numpy()
    info.placement_table = current_expert_table.numpy()
    assert info.workload_table is not None
    layer_num, num_npus, _ = info.workload_table.shape
    expert_ids, counts = np.unique(info.placement_table[0],
                                   return_counts=True)
    num_redundancy_expert = self.get_redundant_num(num_npus, counts)
    num_original_expert = len(expert_ids)
    layer_workloads = self.add_redundant(info.placement_table,
                                         info.workload_table,
                                         num_original_expert)
    max_heat_per_layer_before = self.calculate_max_heat_per_layer(
        info.workload_table, layer_num)
    npu_heat_all_origin = sum(max_heat_per_layer_before)

    num_node = self.safe_exact_divide(num_npus, 8)
    layer_num = layer_workloads.shape[0]
    expert_num = layer_workloads.shape[1]
    expert_from_device = np.zeros((layer_num, num_original_expert))

    if num_original_expert != expert_num:
        raise ValueError(
            f"The number of original experts ({num_original_expert}) must match expert_num ({expert_num})"
        )

    if num_npus <= 0:
        raise ValueError("The number of NPUs must be greater than 0")

    if num_npus < num_redundancy_expert:
        raise ValueError(
            f"The number of NPUs ({num_npus}) must be greater than or equal to the number of redundant experts ({num_redundancy_expert})"
        )

    global_deployment: list[list[list[int]]] = [[[]
                                                 for _ in range(num_npus)]
                                                for _ in range(layer_num)]
    layer_initial_imbalance = self.calculate_initial_imbalance(
        info.placement_table, layer_workloads)
    max_heat_per_layer_after = np.zeros([layer_num])
    for layer in range(layer_num):
        if layer_initial_imbalance[layer] < 1.01:
            # Already balanced: keep the placement. Its peak load is
            # unchanged, so carry the "before" value over; leaving it at 0
            # would understate the total and report a change that never
            # happened.
            global_deployment[layer] = info.placement_table[layer]
            max_heat_per_layer_after[layer] = max_heat_per_layer_before[
                layer]
            continue

        ave_workload = self.safe_divide(np.sum(layer_workloads[layer]),
                                        num_npus)

        # First occurrence of an expert is its "home" slot; every further
        # occurrence is a redundant slot that may be reassigned.
        rendun_pos: list[list[int]] = [[] for _ in range(num_npus)]
        existing_experts = set()
        for device_id, device in enumerate(info.placement_table[layer]):
            for index, expert_id in enumerate(device):
                if expert_id not in existing_experts:
                    existing_experts.add(expert_id)
                    expert_from_device[layer][expert_id] = device_id
                else:
                    rendun_pos[device_id].append(index)

        result, _, com_between_devices = self.redundant_expert_deployment(
            layer_workloads[layer], info.placement_table[layer],
            expert_from_device[layer], num_node, is_node_redundant,
            rendun_pos)

        global_deployment[layer], _ = self.exchange_experts(
            result, com_between_devices, num_node, num_npus,
            is_node_redundant, ave_workload, increment,
            num_redundancy_expert, info.placement_table[layer])

        max_heat_per_layer_after[layer] = max(
            result, key=lambda x: x['total_load'])['total_load']

    layer_changed_ratio = [
        self.safe_divide(max_heat_per_layer_after[layer_idx],
                         max_heat_per_layer_before[layer_idx])
        for layer_idx in range(layer_num)
    ]

    per_layer_priority = np.argsort(layer_changed_ratio)
    npu_heat_all_after = sum(max_heat_per_layer_after)

    change = 0
    if npu_heat_all_after < 0.95 * npu_heat_all_origin:
        change = 1

    new_global_deployment = self.constraint_expert_local_exchange(
        current_expert_table, global_deployment)

    return change, per_layer_priority, np.array(
        new_global_deployment).tolist()
class PolicyFactory:
    """Builds an EPLB policy object from an integer policy id."""

    @staticmethod
    def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy:
        """Instantiate the policy class registered under ``policy_type``.

        Args:
            policy_type: Integer id of the policy (0, 1 or 2 are registered).
            config: Configuration object handed to the policy constructor.

        Returns:
            A new policy instance. Ids that are not registered fall back to
            ``RandomLoadBalance``.
        """
        # Constraint when using Dynamic EPLB policy V2:
        #   if redundant experts exist, only one redundant expert may be
        #   placed on an NPU and its physical expert index must be 0.
        #
        # Applying greedy d2d expert weight update composing.
        registry = {
            # RandomLoadBalance: shuffle last physical expert on NPU 1 and 3
            0: RandomLoadBalance,
            # Dynamic EPLB policy: overall expert replacement based on
            # current moe load
            1: DynamicEplb,
            # Dynamic EPLB policy V2: expert replacement with constrained
            # number of expert shuffle
            2: DynamicEplbV2,
        }
        policy_cls = registry.get(policy_type, RandomLoadBalance)
        return policy_cls(config)
random.seed(42)


class RandomLoadBalance(EplbPolicy):
    """Placeholder policy that swaps one expert slot between two fixed cards."""

    def __init__(self, config: DynamicConfig):
        super().__init__(config)

    def rebalance_experts(self, current_expert_table, expert_workload):
        """Swap the last expert slot of card 3 and card 1 in every layer.

        Args:
            current_expert_table: Per-layer, per-card expert placement. It is
                deep-copied, never modified in place. The last entry of a
                card row must support ``.clone()``.
            expert_workload: Accepted for interface compatibility; not read.

        Returns:
            A tuple ``(1, priorities, new_table)`` where ``priorities`` is the
            list of negated layer indices and ``new_table`` is the swapped copy.
        """
        swapped_table = copy.deepcopy(current_expert_table)
        num_layers = len(current_expert_table)

        # The card pair is fixed; drawing two cards with random.sample is
        # currently disabled.
        first_card, second_card = 3, 1

        for layer_idx in range(num_layers):
            layer_table = swapped_table[layer_idx]
            # swap redundant experts (clone first so the value survives the
            # overwrite of the first slot)
            held_expert = layer_table[first_card][-1].clone()
            layer_table[first_card][-1] = layer_table[second_card][-1]
            layer_table[second_card][-1] = held_expert

        priorities = [-layer_idx for layer_idx in range(num_layers)]
        return 1, priorities, swapped_table
class EplbUpdator:
    """Drives the EPLB schedule from the model-runner side.

    An iteration counter (``cur_iterations``) is advanced once per forward
    pass and decides when expert load is gathered and the EPLB worker is
    woken, when the worker's update plan is fetched, and during which
    iterations queued expert updates are applied (one entry per iteration).
    """

    def __init__(self, ascend_config, loader, eplb_process: "EplbProcess",
                 process):
        """Store collaborators and initialise the schedule state.

        Args:
            ascend_config: Config object providing ``expert_map_path``,
                ``expert_map_record_path``, ``num_iterations_eplb_update``,
                ``num_wait_worker_iterations`` and ``gate_eplb``.
            loader: Object used for the device-to-device expert transfer.
            eplb_process: Worker wrapper exposing ``shared_dict``,
                ``planner_q`` and ``block_update_q``.
            process: Handle of the launched worker process.
        """
        self.ascend_config = ascend_config
        self.init_eplb(self.ascend_config.expert_map_path, process)
        self.eplb_loader = loader
        self.eplb_process = eplb_process
        self.shared_dict = self.eplb_process.shared_dict

    def set_adaptor(self, adaptor):
        """Attach the model adaptor and cache its layer/expert counts."""
        self.adaptor = adaptor
        self.num_moe_layers = self.adaptor.num_moe_layers
        self.global_expert_num = self.adaptor.global_expert_num

    def init_eplb(self, expert_map_path, process):
        """Initialise counters, paths and flags used by the schedule.

        Args:
            expert_map_path: Path of the initial expert map file.
            process: Handle of the launched EPLB worker process.
        """
        self.rank_id = dist.get_rank()
        self.num_expert_load_gather = 10
        self.periodic_load_gather = True
        self.num_iterations_eplb_update: int = (
            self.ascend_config.num_iterations_eplb_update)
        self.expert_map_path = expert_map_path
        self.expert_map_record_path = self.ascend_config.expert_map_record_path

        try:
            collect_periodically = bool(envs.VLLM_ALLOW_EXPERT_LOAD_COLLECTING)
        except Exception:
            # Best effort: a flag that cannot be read counts as disabled.
            collect_periodically = False
        if not collect_periodically:
            self.num_expert_load_gather = self.num_iterations_eplb_update
            self.periodic_load_gather = False

        self.expert_map_initialized = False
        self.gate_eplb = self.ascend_config.gate_eplb

        self.reqs = []
        self.update_info_all = []

        self.cur_iterations: int = 0

        self.num_wait_worker_iterations: int = (
            self.ascend_config.num_wait_worker_iterations)

        self.process = process

        logger.info(
            f"[ModelRunner] Launched EPLB process (pid={self.process.pid})")

    def update_iteration(self):
        """Advance the iteration counter and finish a cycle when it is due.

        A cycle ends when the counter reaches
        ``num_iterations_eplb_update + num_wait_worker_iterations +
        num_moe_layers``. At that point the expert maps are optionally
        exported, the recorded MoE loads are cleared and, unless
        ``gate_eplb`` is set, the counter restarts from zero.
        """
        self.cur_iterations += 1
        cycle_length = (self.num_iterations_eplb_update +
                        self.num_wait_worker_iterations + self.num_moe_layers)
        if self.cur_iterations == cycle_length:
            if self.expert_map_record_path is not None:
                self.adaptor._export_tensor_to_file(
                    self.shared_dict["expert_maps"],
                    self.expert_map_record_path)

            self.adaptor.model.clear_all_moe_loads()
            # With gate_eplb the counter keeps growing, so no further cycle
            # is triggered.
            if not self.gate_eplb:
                self.cur_iterations = 0

    def get_update_info_flag(self):
        """Return True on the iteration in which the update plan is fetched."""
        return self.cur_iterations == (self.num_iterations_eplb_update +
                                       self.num_wait_worker_iterations - 1)

    def wakeup_eplb_worker_flag(self):
        """Return True on the iteration in which the worker is woken."""
        return self.cur_iterations == (self.num_iterations_eplb_update - 1)

    def update_expert_weight_flag(self):
        """Return True while the counter is inside the weight-update window.

        The window covers ``num_moe_layers`` consecutive iterations starting
        at ``num_iterations_eplb_update + num_wait_worker_iterations``.
        """
        weight_update_counter = self.cur_iterations - (
            self.num_iterations_eplb_update + self.num_wait_worker_iterations)
        return 0 <= weight_update_counter < self.num_moe_layers

    def get_init_expert_map(self):
        """Load the initial expert maps into ``shared_dict`` once.

        Failures are logged as a warning and not re-raised.
        """
        try:
            if not self.expert_map_initialized:
                self.shared_dict[
                    "expert_maps"] = self.adaptor.get_init_expert_map_from_file(
                        self.num_moe_layers, self.expert_map_path)
                self.expert_map_initialized = True
        except Exception as e:
            logger.warning(
                f"[ModelRunner] Failed to load initial expert map: {e}",
                exc_info=True)

    def wakeup_eplb_worker(self):
        """Signal the EPLB worker by putting a token on its planner queue."""
        self.eplb_process.planner_q.put(1)

    def forward_before(self):
        """Start the expert transfer for the next queued update, if due."""
        if self.update_expert_weight_flag():
            (expert_send_info, expert_recv_info, updated_expert_map,
             log2phy_map, layer_id) = self.update_info_all.pop(0)
            log2phy_map_this_rank = torch.from_numpy(numpy.array(log2phy_map))
            self.eplb_loader.set_log2phy_map(log2phy_map_this_rank)
            updated_expert_map_this_rank = torch.from_numpy(
                numpy.array(updated_expert_map))
            # layer_id from the worker is offset by the adaptor's dense-layer
            # count before it is handed to the loader.
            self.eplb_loader.generate_expert_d2d_transfer_task(
                expert_send_info, expert_recv_info,
                updated_expert_map_this_rank,
                layer_id + self.adaptor.num_dense_layers)

            # set asynchronous stream for d2d expert weight update
            self.reqs = []
            self.eplb_loader.asyn_expert_weight_transfer(self.reqs)

    def take_update_info_from_eplb_process(self):
        """Fetch the worker's update plan on the scheduled iteration.

        ``block_update_q.get()`` is called without a timeout.
        """
        # Batch after eplb process being triggered, get update info provided by eplb process
        if self.get_update_info_flag():
            self.update_info_all = self.eplb_process.block_update_q.get()

    def forward_end(self):
        """Run the end-of-forward steps and advance the iteration counter."""
        if self.wakeup_eplb_worker_flag():
            self.compute_and_set_moe_load(is_clear=True)
            self.wakeup_eplb_worker()

        if self.update_expert_weight_flag():
            self.eplb_loader.update_expert_map_and_weight(self.reqs)

        self.update_iteration()

    def compute_and_set_moe_load(self, is_clear=False):
        """Collect this rank's expert load and publish it in ``shared_dict``.

        With an initialised process group the loads of all ranks are
        all-gathered and the rank axis is moved to position 1. Otherwise the
        local load gets a singleton axis at position 1. A CPU copy is stored
        under ``shared_dict["moe_load"]``.

        Args:
            is_clear: Accepted for interface compatibility; not read here.

        Returns:
            The load tensor before the ``.cpu()`` conversion.
        """
        local_load = self.adaptor.get_rank_expert_workload()

        # Recorded in both branches so warm_up_eplb can rely on them.
        self.device = local_load.device
        if dist.is_initialized():
            self.world_size = dist.get_world_size()
            # A fresh buffer is allocated on every call; the returned tensor
            # is a view of it and must not be overwritten by a later call.
            self._gather_buffer = torch.empty(
                (self.world_size, *local_load.shape),
                dtype=local_load.dtype,
                device=self.device)

            dist.all_gather_into_tensor(self._gather_buffer, local_load)

            moe_load = self._gather_buffer.permute(1, 0, 2)
        else:
            self.world_size = 1
            self._gather_buffer = None
            moe_load = local_load.unsqueeze(1)

        self.shared_dict["moe_load"] = moe_load.cpu()
        logger.debug(
            f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}"
        )
        return moe_load

    def warm_up_eplb(self):
        """Load the initial maps, publish the load and warm up p2p links.

        A one-element tensor is sent to and received from every other rank.
        With a single rank there is nothing to exchange and the method
        returns after publishing the load.
        """
        self.get_init_expert_map()
        self.compute_and_set_moe_load()

        if self.world_size <= 1:
            # No peers: the original code reached the wait loop with `reqs`
            # unbound in this case.
            return

        src_tensor = torch.empty((1, ), device=self.device)
        self_rank = dist.get_rank()
        peers = [
            rank for rank in range(self.world_size) if rank != self_rank
        ]

        # All sends first, then all receives; both use the same dummy tensor.
        comm_op_list = [
            dist.P2POp(dist.isend, src_tensor, dst_rank) for dst_rank in peers
        ]
        comm_op_list.extend(
            dist.P2POp(dist.irecv, src_tensor, src_rank) for src_rank in peers)

        if comm_op_list:
            for req in dist.batch_isend_irecv(comm_op_list):
                req.wait()

    def shutdown(self):
        """
        Clean up the EPLB process.
        """
        if self.process.is_alive():
            self.process.terminate()
            self.process.join()
            logger.info("[ModelRunner] EPLB process terminated")
def _dense_layer_offset(model):
    """Return ``model.num_dense_layers`` if present, otherwise 0."""
    return getattr(model, "num_dense_layers", 0)


def get_expert_map(self, layer_id):
    """Return the expert map of the layer at absolute index ``layer_id``."""
    return self.model.layers[layer_id].mlp.experts.get_map()


def get_log2phy_map(self, layer_id):
    """Return the log2phy map of the layer at absolute index ``layer_id``."""
    return self.model.layers[layer_id].mlp.experts.get_log2phy_map()


def get_all_expert_map(self, num_moe_layers):
    """Stack the expert maps of the first ``num_moe_layers`` MoE layers.

    MoE layer ``i`` is read from absolute layer index ``i + offset``, where
    ``offset`` is ``self.num_dense_layers`` when that attribute exists and 0
    otherwise.

    Returns:
        A tensor with one row per MoE layer (``torch.stack`` along dim 0).
    """
    offset = _dense_layer_offset(self)
    # presumably each map is 1-D with one entry per expert -- TODO confirm
    per_layer_maps = [
        self.get_expert_map(layer_id + offset)
        for layer_id in range(num_moe_layers)
    ]
    return torch.stack(per_layer_maps, dim=0)


def get_all_moe_loads(self):
    """Stack ``experts.moe_load`` of all ``self.num_moe_layers`` MoE layers."""
    offset = _dense_layer_offset(self)
    return torch.stack(
        [
            self.model.layers[layer_id + offset].mlp.experts.moe_load
            for layer_id in range(self.num_moe_layers)
        ],
        dim=0,
    )


def clear_all_moe_loads(self):
    """Call ``clear_moe_load()`` on the experts of every MoE layer."""
    offset = _dense_layer_offset(self)
    for layer_id in range(self.num_moe_layers):
        self.model.layers[layer_id + offset].mlp.experts.clear_moe_load()


def model_register(model, model_config):
    """Bind the EPLB helper methods to ``model`` and record its layer counts.

    Args:
        model: Model object; the helpers are attached as bound methods.
        model_config: Object whose ``hf_config`` provides ``model_type``,
            ``num_hidden_layers`` and, for DeepSeek, ``first_k_dense_replace``.

    Raises:
        NotImplementedError: If ``model_type`` is not ``qwen3_moe``,
            ``deepseek_v2`` or ``deepseek_v3``. The helpers are already bound
            when this is raised.
    """
    for helper in (get_expert_map, get_log2phy_map, get_all_expert_map,
                   get_all_moe_loads, clear_all_moe_loads):
        setattr(model, helper.__name__, types.MethodType(helper, model))

    config = model_config.hf_config

    if config.model_type == "qwen3_moe":
        model.num_moe_layers = config.num_hidden_layers
    elif config.model_type in ("deepseek_v2", "deepseek_v3"):
        num_dense_layers = config.first_k_dense_replace
        # Store the offset on the model: the bound helpers read
        # `num_dense_layers` and would otherwise start at layer 0, which is
        # a dense layer.
        model.num_dense_layers = num_dense_layers
        model.num_moe_layers = config.num_hidden_layers - num_dense_layers
    else:
        raise NotImplementedError("EPLB is not supported.")
a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py index 930549a..8c38627 100644 --- a/vllm_ascend/ops/common_fused_moe.py +++ b/vllm_ascend/ops/common_fused_moe.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # - +import os.path from typing import Callable, Optional import torch @@ -26,10 +26,13 @@ from vllm.forward_context import get_forward_context from vllm.model_executor.layers.fused_moe.config import \ FusedMoEParallelConfig # isort: skip from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, UnquantizedFusedMoEMethod) + FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map) from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_mc2_group +from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map, + determine_default_log2phy_map) +from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer from vllm_ascend.ops.moe.experts_selector import select_experts from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl, AlltoAllCommImpl, MC2CommImpl, @@ -226,14 +229,52 @@ def process_weights_after_loading(self, layer): class AscendFusedMoE(FusedMoE): + moe_counter = -1 def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + AscendFusedMoE.moe_counter += 1 + self.moe_instance_id = AscendFusedMoE.moe_counter self.moe_config.tp_group = get_tp_group() self.moe_config.dp_group = get_dp_group() self.moe_config.ep_group = get_ep_group() self.moe_config.mc2_group = get_mc2_group() + ascend_config = get_ascend_config() + self.dynamic_eplb = ascend_config.dynamic_eplb + self.expert_map_path = ascend_config.expert_map_path + self.global_redundant_expert_num = ascend_config.init_redundancy_expert + # static eplb initializing with expert_map_path + if self.expert_map_path and os.path.exists( + self.expert_map_path) and os.access(self.expert_map_path, + 
os.R_OK): + self.expert_load_balancer = ExpertLoadBalancer( + self.expert_map_path, self.global_num_experts) + self.local_num_experts, self.expert_map = ( + self.expert_load_balancer.get_rank_placement_map( + self.moe_instance_id, self.ep_rank)) + self.log2phy = self.expert_load_balancer.get_rank_log2phy_map( + self.moe_instance_id, self.ep_rank).npu() + self.global_redundant_expert_num = ( + self.expert_load_balancer.get_global_redundant_expert_num()) + else: + # init moe. + self.local_num_experts, self.expert_map = determine_expert_map( + self.ep_size, self.ep_rank, self.global_num_experts) + # dynamic eplb initializing with not expert_map_path + if self.dynamic_eplb: + self.global_redundant_expert_num = ascend_config.init_redundancy_expert + self.local_num_experts, self.expert_map = determine_default_expert_map( + self.global_num_experts, self.ep_size, self.ep_rank, + self.global_redundant_expert_num) + self.log2phy = determine_default_log2phy_map( + self.global_num_experts, self.ep_size, self.ep_rank, + self.global_redundant_expert_num) + local_num_experts = (torch.sum( + self.expert_map != -1) if self.expert_map is not None else + self.global_num_experts) + if self.dynamic_eplb: + self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64) for method in { AllGatherCommImpl, AlltoAllCommImpl, MC2CommImpl, @@ -243,6 +284,19 @@ class AscendFusedMoE(FusedMoE): self, method.__name__.lower(), method(moe_config=self.moe_config)) # type: ignore[abstract] + def update_expert_map(self, new_expert_map): + self.expert_map = new_expert_map + + def get_map(self): + return self.expert_map + + def get_log2phy_map(self): + return self.logical_to_physical_map + + def clear_moe_load(self): + if self.moe_load is not None: + self.moe_load.zero_() + def maybe_all_reduce_tensor_model_parallel( self, final_hidden_states: torch.Tensor): """NOTE(Yizhou): This is to override the parent class method. 
In `mc2commimpl`, @@ -292,6 +346,12 @@ class AscendFusedMoE(FusedMoE): logical_to_physical_map=self.logical_to_physical_map, logical_replica_count=self.logical_replica_count, ) + if isinstance(final_hidden_states, tuple): + final_hidden_states, group_list_type, expert_tokens = final_hidden_states + + if self.dynamic_eplb: + self.moe_load += expert_tokens if group_list_type else \ + torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]]) final_hidden_states = forward_context.moe_comm_method.finalize( hidden_states=final_hidden_states, diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 76b677a..a2b5915 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -37,6 +37,8 @@ from vllm.model_executor.layers.quantization.base_config import \ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_mc2_group +from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map, + determine_default_log2phy_map) from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer from vllm_ascend.ops.moe.experts_selector import select_experts from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl, @@ -58,6 +60,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod): self.global_batch_size = vllm_config.scheduler_config.max_num_seqs self.max_model_len = vllm_config.model_config.max_model_len get_ascend_config() + self.dynamic_eplb = get_ascend_config().dynamic_eplb try: device_group = get_mc2_group().device_group @@ -136,7 +139,8 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod): global_num_experts=global_num_experts, expert_map=expert_map, shared_experts=shared_experts, - need_trans=True) + need_trans=True, + dynamic_eplb=self.dynamic_eplb) class AscendFusedMoE(FusedMoE): @@ -234,25 +238,40 @@ class AscendFusedMoE(FusedMoE): self.moe_parallel_config.ep_size, is_deepseek_v3_r1) ascend_config = 
get_ascend_config() - expert_map_path = ascend_config.expert_map_path - if expert_map_path and os.path.exists(expert_map_path): - # moe expert load balance - expert_load_balancer = ExpertLoadBalancer(expert_map_path, - self.global_num_experts) - self.local_num_experts, self.expert_map = \ - expert_load_balancer.get_rank_placement_map( - self.moe_instance_id, - get_ep_group().rank_in_group) - self.log2phy = expert_load_balancer.get_rank_log2phy_map( - self.moe_instance_id, - get_ep_group().rank_in_group) - self.global_redundant_expert_num = \ - expert_load_balancer.get_global_redundant_expert_num() + self.dynamic_eplb = ascend_config.dynamic_eplb + self.expert_map_path = ascend_config.expert_map_path + self.global_redundant_expert_num = ascend_config.init_redundancy_expert + self.global_num_experts = num_experts + self.global_redundant_expert_num + # static eplb initializing with expert_map_path + if self.expert_map_path and os.path.exists( + self.expert_map_path) and os.access(self.expert_map_path, + os.R_OK): + self.expert_load_balancer = ExpertLoadBalancer( + self.expert_map_path, self.global_num_experts) + self.local_num_experts, self.expert_map = ( + self.expert_load_balancer.get_rank_placement_map( + self.moe_instance_id, self.ep_rank)) + self.log2phy = self.expert_load_balancer.get_rank_log2phy_map( + self.moe_instance_id, self.ep_rank).npu() + self.global_redundant_expert_num = ( + self.expert_load_balancer.get_global_redundant_expert_num()) else: - # Create a tensor of size num_experts filled with -1 + # init moe. 
self.local_num_experts, self.expert_map = determine_expert_map( - self.ep_size, - get_ep_group().rank_in_group, self.global_num_experts) + self.ep_size, self.ep_rank, self.global_num_experts) + # dynamic eplb initializing with not expert_map_path + if self.dynamic_eplb: + self.global_redundant_expert_num = ascend_config.init_redundancy_expert + self.local_num_experts, self.expert_map = determine_default_expert_map( + self.global_num_experts, self.ep_size, self.ep_rank, + self.global_redundant_expert_num) + self.log2phy = determine_default_log2phy_map( + self.global_num_experts, self.ep_size, self.ep_rank, + self.global_redundant_expert_num) + local_num_experts = (torch.sum(self.expert_map != -1) + if self.expert_map is not None else num_experts) + if self.dynamic_eplb: + self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64) self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp @@ -281,6 +300,11 @@ class AscendFusedMoE(FusedMoE): local_num_experts = torch.sum(self.expert_map != -1) \ if self.expert_map is not None else num_experts + self.moe_load = None + + if self.dynamic_eplb: + self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64) + moe_quant_params = { "num_experts": local_num_experts, "hidden_size": hidden_size, @@ -313,6 +337,19 @@ class AscendFusedMoE(FusedMoE): self, method.__name__.lower(), method(moe_config=self.moe_config)) # type: ignore[abstract] + def update_expert_map(self, new_expert_map): + self.expert_map = new_expert_map + + def get_map(self): + return self.expert_map + + def get_log2phy_map(self): + return self.logical_to_physical_map + + def clear_moe_load(self): + if self.moe_load is not None: + self.moe_load.zero_() + def naive_multicast(self, x: torch.Tensor, cu_tokens_across_dp_cpu: torch.Tensor): assert (len(x.shape) == 2) @@ -401,10 +438,20 @@ class AscendFusedMoE(FusedMoE): dynamic_scale_for_share=dynamic_scale_for_share, ) + group_list_type = None + if shared_experts: - if 
isinstance(e_hidden_states, tuple): + if isinstance(e_hidden_states, + tuple) and len(e_hidden_states) == 2: e_hidden_states, shared_hidden_states = e_hidden_states + if isinstance(e_hidden_states, tuple) and len(e_hidden_states) == 3: + e_hidden_states, group_list_type, expert_tokens = e_hidden_states + + if self.dynamic_eplb and group_list_type is not None: + self.moe_load += expert_tokens if group_list_type else \ + torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]]) + final_hidden_states = forward_context.moe_comm_method.finalize( hidden_states=e_hidden_states, reduce_results=(not self.all_reduce_merge)) diff --git a/vllm_ascend/ops/moe/moe_comm_method.py b/vllm_ascend/ops/moe/moe_comm_method.py index 2194f4f..e4082ba 100644 --- a/vllm_ascend/ops/moe/moe_comm_method.py +++ b/vllm_ascend/ops/moe/moe_comm_method.py @@ -88,7 +88,8 @@ class MoECommMethod(ABC): # For load balance log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, - need_trans: bool = False) -> torch.Tensor: + need_trans: bool = False, + dynamic_eplb: bool = False): # Check constraints assert hidden_states.dtype in [ torch.float32, torch.float16, torch.bfloat16 @@ -133,6 +134,9 @@ class MoECommMethod(ABC): final_hidden_states = self.token_dispatcher.token_combine( hidden_states=mlp_output) + if dynamic_eplb: + return (final_hidden_states, group_list_type, expert_tokens) + return final_hidden_states @abstractmethod diff --git a/vllm_ascend/quantization/w4a8_dynamic.py b/vllm_ascend/quantization/w4a8_dynamic.py index 0de60b7..514bea7 100644 --- a/vllm_ascend/quantization/w4a8_dynamic.py +++ b/vllm_ascend/quantization/w4a8_dynamic.py @@ -24,6 +24,7 @@ from vllm.config import get_current_vllm_config from vllm.distributed import get_ep_group from vllm.forward_context import get_forward_context +from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.ops.moe.experts_selector import 
select_experts @@ -136,6 +137,7 @@ class AscendW4A8DynamicFusedMoEMethod: # NOTE: new quantize weights: 2 int4 pack into int8 self.new_quant_version = quant_version == "1.0.0" self.tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else self.ep_group.world_size + self.dynamic_eplb = get_ascend_config().dynamic_eplb if self.new_quant_version and self.tp_size > 16: raise ValueError( "The current weight does not support moe part tp>16.") @@ -299,7 +301,8 @@ class AscendW4A8DynamicFusedMoEMethod: global_redundant_expert_num=global_redundant_expert_num, shared_experts=shared_experts, quantized_x_for_share=quantized_x_for_share, - dynamic_scale_for_share=dynamic_scale_for_share) + dynamic_scale_for_share=dynamic_scale_for_share, + dynamic_eplb=self.dynamic_eplb) def process_scale(self, weight: torch.Tensor, scale, per_group_scale): group_num, k, n = weight.shape diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index c34140f..ab4987f 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -123,6 +123,7 @@ class AscendW8A8DynamicFusedMoEMethod: vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not vllm_config.model_config.enforce_eager and not ascend_config.torchair_graph_config.enabled) + self.dynamic_eplb = ascend_config.dynamic_eplb try: device_group = get_mc2_group().device_group @@ -229,7 +230,7 @@ class AscendW8A8DynamicFusedMoEMethod: w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, expert_map=expert_map, - ) + dynamic_eplb=self.dynamic_eplb) # this is a naive implementation for experts load balance so as # to avoid accumulating too much tokens on a single rank. 
@@ -255,7 +256,8 @@ class AscendW8A8DynamicFusedMoEMethod: global_redundant_expert_num=global_redundant_expert_num, shared_experts=shared_experts, quantized_x_for_share=quantized_x_for_share, - dynamic_scale_for_share=dynamic_scale_for_share) + dynamic_scale_for_share=dynamic_scale_for_share, + dynamic_eplb=self.dynamic_eplb) def process_weights_after_loading(self, layer): if self.transpose_weight: diff --git a/vllm_ascend/torchair/models/torchair_deepseek_v2.py b/vllm_ascend/torchair/models/torchair_deepseek_v2.py index ec48b56..845793d 100644 --- a/vllm_ascend/torchair/models/torchair_deepseek_v2.py +++ b/vllm_ascend/torchair/models/torchair_deepseek_v2.py @@ -928,6 +928,8 @@ class TorchairDeepseekV2ForCausalLM(DeepseekV2ForCausalLM): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config self.config = config + self.num_dense_layers = self.config.first_k_dense_replace + self.num_moe_layers = self.config.num_hidden_layers - self.num_dense_layers self.quant_config = quant_config self.model = TorchairDeepseekV2Model(vllm_config=vllm_config, prefix=maybe_prefix( diff --git a/vllm_ascend/torchair/ops/torchair_fused_moe.py b/vllm_ascend/torchair/ops/torchair_fused_moe.py index 1bab215..2221130 100644 --- a/vllm_ascend/torchair/ops/torchair_fused_moe.py +++ b/vllm_ascend/torchair/ops/torchair_fused_moe.py @@ -41,6 +41,8 @@ from vllm.model_executor.layers.quantization.base_config import \ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_forward_context import FusedMoEState from vllm_ascend.distributed.parallel_state import get_mc2_group +from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map, + determine_default_log2phy_map) from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer from vllm_ascend.ops.sequence_parallel import MetadataForPadding from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod @@ -1011,25 +1013,40 @@ class TorchairAscendFusedMoE(FusedMoE): 
self.moe_parallel_config.ep_size, is_deepseek_v3_r1) ascend_config = get_ascend_config() - expert_map_path = ascend_config.expert_map_path - if expert_map_path and os.path.exists(expert_map_path): - # moe expert load balance - expert_load_balancer = ExpertLoadBalancer(expert_map_path, - self.global_num_experts) - self.local_num_experts, self.expert_map = \ - expert_load_balancer.get_rank_placement_map( - self.moe_instance_id, - get_ep_group().rank_in_group) - self.log2phy = expert_load_balancer.get_rank_log2phy_map( - self.moe_instance_id, - get_ep_group().rank_in_group) - self.global_redundant_expert_num = \ - expert_load_balancer.get_global_redundant_expert_num() + self.dynamic_eplb = ascend_config.dynamic_eplb + self.expert_map_path = ascend_config.expert_map_path + self.global_redundant_expert_num = ascend_config.init_redundancy_expert + self.global_num_experts = num_experts + self.global_redundant_expert_num + # static eplb initializing with expert_map_path + if self.expert_map_path and os.path.exists( + self.expert_map_path) and os.access(self.expert_map_path, + os.R_OK): + self.expert_load_balancer = ExpertLoadBalancer( + self.expert_map_path, self.global_num_experts) + self.local_num_experts, self.expert_map = ( + self.expert_load_balancer.get_rank_placement_map( + self.moe_instance_id, self.ep_rank)) + self.log2phy = self.expert_load_balancer.get_rank_log2phy_map( + self.moe_instance_id, self.ep_rank).npu() + self.global_redundant_expert_num = ( + self.expert_load_balancer.get_global_redundant_expert_num()) else: - # Create a tensor of size num_experts filled with -1 + # init moe. 
self.local_num_experts, self.expert_map = determine_expert_map( - self.ep_size, - get_ep_group().rank_in_group, self.global_num_experts) + self.ep_size, self.ep_rank, self.global_num_experts) + # dynamic eplb initializing with not expert_map_path + if self.dynamic_eplb: + self.global_redundant_expert_num = ascend_config.init_redundancy_expert + self.local_num_experts, self.expert_map = determine_default_expert_map( + self.global_num_experts, self.ep_size, self.ep_rank, + self.global_redundant_expert_num) + self.log2phy = determine_default_log2phy_map( + self.global_num_experts, self.ep_size, self.ep_rank, + self.global_redundant_expert_num) + local_num_experts = (torch.sum(self.expert_map != -1) + if self.expert_map is not None else num_experts) + if self.dynamic_eplb: + self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64) self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled self.enable_multistream_moe = \ @@ -1064,8 +1081,11 @@ class TorchairAscendFusedMoE(FusedMoE): assert self.quant_method is not None - local_num_experts = torch.sum(self.expert_map != -1) \ - if self.expert_map is not None else num_experts + self.moe_load = None + local_num_experts = (torch.sum(self.expert_map != -1) + if self.expert_map is not None else num_experts) + if self.dynamic_eplb: + self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64) moe_quant_params = { "num_experts": local_num_experts, @@ -1244,6 +1264,11 @@ class TorchairAscendFusedMoE(FusedMoE): if isinstance(e_hidden_states, tuple): e_hidden_states, shared_hidden_states = e_hidden_states + if self.dynamic_eplb and isinstance( + e_hidden_states, tuple) and len(e_hidden_states) == 3: + self.moe_load += e_hidden_states[2] if e_hidden_states[1] == 0 else \ + torch.cat(e_hidden_states[2][:1], e_hidden_states[2][1:] - e_hidden_states[2][:-1]) + if (fused_moe_state not in [ FusedMoEState.AllGather, FusedMoEState.AllGatherEP, FusedMoEState.NaiveMulticast @@ -1288,6 +1313,19 @@ class 
TorchairAscendFusedMoE(FusedMoE): else: return final_hidden_states + def update_expert_map(self, new_expert_map): + self.expert_map = new_expert_map + + def get_map(self): + return self.expert_map + + def get_log2phy_map(self): + return self.logical_to_physical_map + + def clear_moe_load(self): + if self.moe_load is not None: + self.moe_load.zero_() + # ----------------------------------------- TBO-related -------------------------------------------- def _forward_ms_fused_moe_comp( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c267879..f867e5a 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -26,6 +26,7 @@ from collections.abc import Iterator from contextlib import contextmanager, nullcontext from copy import deepcopy from dataclasses import dataclass +from multiprocessing import Manager from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast import numpy as np @@ -93,6 +94,12 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.attention.utils import AscendCommonAttentionMetadata from vllm_ascend.compilation.acl_graph import ACLGraphWrapper +from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor +from vllm_ascend.eplb.core.eplb_device_transfer_loader import \ + D2DExpertWeightLoader +from vllm_ascend.eplb.core.eplb_worker import EplbProcess +from vllm_ascend.eplb.eplb_updator import EplbUpdator +from vllm_ascend.eplb.utils import model_register from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention from vllm_ascend.multistream.ms_split import compute_split_seq_index from vllm_ascend.platform import NPUPlatform @@ -422,6 +429,23 @@ class NPUModelRunner(LoRAModelRunnerMixin): dtype=torch.bool, device=self.device, ) + self.dynamic_eplb = ascend_config.dynamic_eplb + if self.dynamic_eplb: + self.is_eplb_warmuped = False + 
self.eplb_loader = D2DExpertWeightLoader() + self.manager = Manager() + self.shared_dict = self.manager.dict({ + "expert_map": None, + "moe_load": None, + "expert_maps": None + }) + self.eplb_process = EplbProcess(shared_dict=self.shared_dict, + policy_type=1, + enable_d2d=True) + self.process = self.eplb_process._launch_process() + ascend_config = get_ascend_config() + self.eplb_updator = EplbUpdator(ascend_config, self.eplb_loader, + self.eplb_process, self.process) self.use_async_scheduling = self.scheduler_config.async_scheduling self.async_output_copy_stream = torch.npu.Stream() if \ @@ -1736,12 +1760,19 @@ class NPUModelRunner(LoRAModelRunnerMixin): # Return empty ModelRunnerOuptut if there's no work to do. return EMPTY_MODEL_RUNNER_OUTPUT return self.kv_connector_no_forward(scheduler_output) + + if self.dynamic_eplb: + self.eplb_updator.forward_before() + (attn_metadata, positions, num_scheduled_tokens_np, num_input_tokens, num_tokens_across_dp, maybe_padded_num_tokens, logits_indices, spec_decode_metadata, input_ids, inputs_embeds, intermediate_tensors) = (self._prepare_inputs( scheduler_output, intermediate_tensors)) + if self.dynamic_eplb: + self.eplb_updator.take_update_info_from_eplb_process() + moe_comm_method = self._select_moe_comm_method(num_input_tokens, self.with_prefill) @@ -2004,7 +2035,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill" logger.info("Profile execute duration [%s]:%s", captured_name, " ".join(dr_str)) - + if self.dynamic_eplb: + self.eplb_updator.forward_end() if not self.use_async_scheduling: return model_runner_output @@ -2169,6 +2201,9 @@ class NPUModelRunner(LoRAModelRunnerMixin): num_reqs, skip_attn=True) + if not self.in_profile_run and self.dynamic_eplb: + self.eplb_updator.forward_before() + with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): if self.is_multimodal_model: @@ -2251,6 +2286,11 @@ class 
NPUModelRunner(LoRAModelRunnerMixin): num_tokens_across_dp=num_tokens_across_dp) if need_dummy_logits: dummy_compute_logits(hidden_states) + if self.in_profile_run and self.dynamic_eplb: + self.model.clear_all_moe_loads() + if not self.in_profile_run and self.dynamic_eplb: + self.eplb_updator.take_update_info_from_eplb_process() + self.eplb_updator.forward_end() return hidden_states @contextmanager @@ -2357,12 +2397,21 @@ class NPUModelRunner(LoRAModelRunnerMixin): max_task = max(output_size.items(), key=lambda x: x[1])[0] return self._dummy_pooler_run_task(hidden_states, max_task) + def eplb_warmup(self): + if self.dynamic_eplb and not self.is_eplb_warmuped: + self.is_eplb_warmuped = True + self.eplb_adaptor = VllmEplbAdaptor(model=self.model) + self.eplb_loader.set_adator(self.eplb_adaptor) + self.eplb_updator.set_adaptor(self.eplb_adaptor) + self.eplb_updator.warm_up_eplb() + def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 self.model = get_model(vllm_config=self.vllm_config) - + if self.dynamic_eplb: + model_register(self.model, self.model_config) if is_310p(): from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 8af3d31..0a03e4a 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -250,6 +250,7 @@ class NPUWorker(WorkerBase): def compile_or_warm_up_model(self) -> None: # Note: need to adapt for graph mode. + self.model_runner.eplb_warmup() warmup_sizes = (self.vllm_config.compilation_config.compile_sizes or []).copy() if not self.model_config.enforce_eager: