; ---------------------------------------------------------------------------
; Demo (auto-execute section): shows the sample text converted by ComUnHTML()
; and by UnHTM().
;
; NOTE(review): the first column of the sample below presumably held HTML
; entities that were decoded somewhere upstream; as it stands it contains no
; entities for the converters to decode. Restore it from the original source
; if available - TODO confirm.
; ---------------------------------------------------------------------------
HTM =
(
§ § параграф
© © знак охраны авторского права (copyright)
® ® символ зарегистрированного товарного знака
™ ™ символ товарного знака
° ° знак градуса
« « левая кавычка (левая ёлочка)
» » правая кавычка (правая ёлочка)
… … многоточие
’ апостроф
„ „ открывающая лапка
“ “ закрывающая лапка
“ “ открывающая английская лапка
” ” закрывающая английская лапка
• • жирная точка
– – короткое тире (см. одноименный § 158)
− − минус
± ± плюс-минус
— — тире
№ № знак номера
)

;!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
;!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
t := UnHTM(UnSlashUnicode(HTM))

; split long line to smaller lines about 40-50 symbols length
;t := RegExReplace(t,".{40,50}(\s)","$0`n")
;MsgBox, % t

MsgBox, % ComUnHTML( HTM )
MsgBox, % UnHTM( HTM )

; Convert HTML to plain text by writing it into an "HtmlFile" COM document
; and reading back the innerText of its root element.
;   html - HTML markup to convert
; Returns the text of the parsed document.
ComUnHTML(html)
{
    oHTML := ComObjCreate("HtmlFile")
    oHTML.write(html)
    return oHTML.documentElement.innerText
}

; Remove HTML formatting / convert to ordinary text.
; Based on UnHTM by SKAN, 19-Nov-2009
; (forum topic: www.autohotkey.com/forum/topic51342.html).
;   HTM - HTML markup
; Returns the text with every <tag> replaced by a space, known named entities
; and numeric character references (decimal and hexadecimal) decoded, and
; leading/trailing whitespace removed.
UnHTM( HTM )
{
    ; Lookup table: each entry is an entity name without the trailing ";"
    ; immediately followed by the single character it stands for.
    Static HT
    If (HT = "")
    {
        HT := "&aacuteá&acircâ&acute´&aeligæ&agraveà&amp&&aringå&atildeã&aumlä"
            . "&bdquo„&brvbar¦&bull•&ccedilç&cedil¸&cent¢&circˆ&copy©&curren¤"
            . "&dagger†&dagger‡&deg°&divide÷&eacuteé&ecircê&egraveè&ethð&eumlë"
            . "&euro€&fnofƒ&frac12½&frac14¼&frac34¾&gt>&hellip…&iacuteí&icircî"
            . "&iexcl¡&igraveì&iquest¿&iumlï&laquo«&ldquo“&lsaquo‹&lsquo‘&lt<"
            . "&macr¯&mdash—&microµ&middot·&nbsp &ndash–&not¬&ntildeñ&oacuteó"
            . "&ocircô&oeligœ&ograveò&ordfª&ordmº&oslashø&otildeõ&oumlö&para¶"
            . "&permil‰&plusmn±&pound£&quot""&raquo»&rdquo”&reg®&rsaquo›&rsquo’"
            . "&sbquo‚&scaronš&sect§&shy" . Chr(173) . "&sup1¹&sup2²&sup3³"
            . "&szligß&thornþ&tilde˜&times×&trade™&uacuteú&ucircû&ugraveù&uml¨"
            . "&uumlü&yacuteý&yen¥&yumlÿ"
        ; The table search below is case-insensitive and takes the first hit,
        ; so the second "&dagger" entry (the double dagger) is never selected.
    }

    ;HTM := RegExReplace( HTM,"&amp;(\w+;)", "&$1" )   ;!! to handle &amp;lt;
    ;HTM := RegExReplace( HTM,"&amp;(#\d+;)", "&$1" )  ;!! to handle &amp;#60;

    TXT := RegExReplace( HTM, "<[^>]+>", " " )   ; Replace every tag between "<" and ">" by a space

    ; Build R, a ";"-separated list of the distinct "&name" candidates found in
    ; TXT: splitting on "&" and ";" puts the candidates in the even-numbered fields.
    Loop, Parse, TXT, &`;
        L := "&" A_LoopField ";", R .= (!(A_Index&1)) ? ( (!InStr(R,L,1)) ? L : "" ) : ""
    StringTrimRight, R, R, 1

    Loop, Parse, R, `;
    {
        Key := A_LoopField
        F   := InStr( HT, Key )                              ; Lookup HT data
        Val := F ? SubStr( HT, F + StrLen(Key), 1 ) : ""
        ; Entity names consist of ASCII letters/digits only, while every value
        ; in HT is a non-alphanumeric character. If the character after the hit
        ; is alphanumeric (or missing), Key matched only the beginning of a
        ; longer name (e.g. "&T" inside "&thorn") and must not be replaced.
        If ( F && !RegExMatch( Val, "^[A-Za-z0-9]?$" ) )
            StringReplace, TXT, TXT, %Key%`;, % Val, All
        Else If ( SubStr( Key, 2, 1 ) = "#" )
        {
            ; Numeric character reference: &#60; (decimal) or &#x3C; (hexadecimal)
            Code := SubStr( Key, 3 )
            If ( SubStr( Code, 1, 1 ) = "x" )
                Code := "0x" . SubStr( Code, 2 )
            StringReplace, TXT, TXT, %Key%`;, % Chr(Code), All
        }
    }

    ;TXT := RegExReplace(TXT, " +", " ")   ;!! collapse multiple spaces into one
    Return RegExReplace( TXT, "(^\s*|\s*$)" )    ; Remove leading/trailing white spaces
}

;---------------------------------------
; Convert strings
;---------------------------------------
/*
CP_ACP   = 0
CP_OEMCP = 1
CP_MACCP = 2
CP_UTF7  = 65000
CP_UTF8  = 65001
*/
; NOTE(review): these helpers hand the address of an AHK string variable to
; MultiByteToWideChar as a multi-byte string, so they appear to assume an ANSI
; build of AutoHotkey - TODO confirm before using them on a Unicode build.

; Convert a string from code page 0 (CP_ACP) to code page 1 (CP_OEMCP).
Ansi2Oem(sString)
{
    Ansi2Unicode(sString, wString, 0)
    Unicode2Ansi(wString, zString, 1)
    Return zString
}

; Convert a string from code page 1 (CP_OEMCP) to code page 0 (CP_ACP).
Oem2Ansi(zString)
{
    Ansi2Unicode(zString, wString, 1)
    Unicode2Ansi(wString, sString, 0)
    Return sString
}

; Convert a string from code page 0 (CP_ACP) to code page 65001 (CP_UTF8).
Ansi2UTF8(sString)
{
    Ansi2Unicode(sString, wString, 0)
    Unicode2Ansi(wString, zString, 65001)
    Return zString
}

; Convert a string from code page 65001 (CP_UTF8) to code page 0 (CP_ACP).
UTF82Ansi(zString)
{
    Ansi2Unicode(zString, wString, 65001)
    Unicode2Ansi(wString, sString, 0)
    Return sString
}

; Convert the multi-byte string in sString (code page CP) to wide characters.
;   sString - [in]  source variable; its address is passed to the API
;   wString - [out] receives the wide-character data including the terminator
;   CP      - source code page, default 0 (CP_ACP)
; Pointer arguments use the "Ptr" type so addresses are not truncated on
; 64-bit builds.
Ansi2Unicode(ByRef sString, ByRef wString, CP = 0)
{
    ; First call with a null output buffer returns the required size in characters.
    nSize := DllCall("MultiByteToWideChar"
        , "UInt", CP
        , "UInt", 0
        , "Ptr",  &sString
        , "Int",  -1
        , "Ptr",  0
        , "Int",  0)

    VarSetCapacity(wString, nSize * 2)

    DllCall("MultiByteToWideChar"
        , "UInt", CP
        , "UInt", 0
        , "Ptr",  &sString
        , "Int",  -1
        , "Ptr",  &wString
        , "Int",  nSize)
}

; Convert the wide-character data in wString to a multi-byte string (code page CP).
;   wString - [in]  variable holding wide-character data; its address is passed to the API
;   sString - [out] receives the converted string
;   CP      - target code page, default 0 (CP_ACP)
; Pointer arguments use the "Ptr" type so addresses are not truncated on
; 64-bit builds.
Unicode2Ansi(ByRef wString, ByRef sString, CP = 0)
{
    ; First call with a null output buffer returns the required size in bytes.
    nSize := DllCall("WideCharToMultiByte"
        , "UInt", CP
        , "UInt", 0
        , "Ptr",  &wString
        , "Int",  -1
        , "Ptr",  0
        , "Int",  0
        , "Ptr",  0
        , "Ptr",  0)

    VarSetCapacity(sString, nSize)

    DllCall("WideCharToMultiByte"
        , "UInt", CP
        , "UInt", 0
        , "Ptr",  &wString
        , "Int",  -1
        , "Str",  sString
        , "Int",  nSize
        , "Ptr",  0
        , "Ptr",  0)
}

;-------------------------------------------------
; HTML encode/decode
;------------------------------------------------

; Percent-encode a string.
; v 0.3 / (w) 24.06.2008 by derRaphael / zLib-Style release
;   str - text to encode
; Every character whose code is above 0x7F, below 0x30, or equal to 0x3D ("=")
; is replaced by "%" plus its code in hexadecimal (padded to two digits);
; all other characters are copied unchanged.
UriEncode(str)
{
    b_Format := A_FormatInteger          ; remember the caller's integer format
    data := ""
    SetFormat, Integer, H                ; Asc() results now render as 0xNN
    Loop, Parse, str
    {
        if ((Asc(A_LoopField) > 0x7f) || (Asc(A_LoopField) < 0x30) || (Asc(A_LoopField) = 0x3d))
        {
            c := SubStr(Asc(A_LoopField), 3)          ; drop the "0x" prefix
            data .= "%" . ((StrLen(c) < 2) ? "0" . c : c)
        }
        else
            data .= A_LoopField
    }
    SetFormat, Integer, %b_Format%       ; restore the caller's integer format
    return data
}

; Decode a percent-encoded string.
; v 0.1 / (w) 28.06.2008 by derRaphael / zLib-Style release
;   str - text containing %XX sequences
; The text is split on "%"; the first two characters of every field after the
; first are interpreted as a hexadecimal character code.
UriDecode(str)
{
    Loop, Parse, str, `%
        txt := (A_Index = 1) ? A_LoopField : txt . Chr("0x" . SubStr(A_LoopField, 1, 2)) . SubStr(A_LoopField, 3)
    return txt
}

; Unslash unicode sequences like \u0026
; by Mikhail Kuropyatnikov 2009 (micdelt@mail.ru)
;   s - text containing \uXXXX sequences (four hexadecimal digits)
; Returns s with each sequence replaced by Chr() of its code. The replacement
; is spliced in at the matched position, so an identical-looking sequence
; produced earlier in the string by a previous replacement is left alone.
UnSlashUnicode(s)
{
    rx := "\\u([0-9a-fA-F]{4})"
    pos := 0
    Loop
    {
        pos := RegExMatch(s, rx, m, pos + 1)
        if (pos = 0)
            break
        ; m is the whole 6-character match, m1 the four hex digits.
        s := SubStr(s, 1, pos - 1) . Chr("0x" . m1) . SubStr(s, pos + StrLen(m))
    }
    return s
}