[GB.XML]
* BUG: the parser now correctly supports UTF-8 * BUG: the parser now allows all special characters in tag names and attribute names, including the UTF-8 ones, as defined in http://www.w3.org/TR/REC-xml/#NT-NameChar git-svn-id: svn://localhost/gambas/trunk@4654 867c0c6c-44f3-4631-809d-bfa615b0a4ec
This commit is contained in:
parent
bda1b7d29d
commit
2cbff79d05
3 changed files with 126 additions and 12 deletions
|
@ -280,7 +280,7 @@ vector<Node*>* Element::fromText(wstring data, wstring::size_type i, uint c, uin
|
|||
|
||||
for(INC; i < data.length(); INC) //On cherche le tagName
|
||||
{
|
||||
if(!isLetter(s) && s != L":" && s != L"-"&& s != L"_") break;
|
||||
if(!isNameChar(s)) break;
|
||||
tag += s;
|
||||
}
|
||||
|
||||
|
@ -357,9 +357,9 @@ vector<Node*>* Element::fromText(wstring data, wstring::size_type i, uint c, uin
|
|||
|
||||
wstring attr, sVal = L"";
|
||||
|
||||
if(isLetter(s))
|
||||
if(isNameStartChar(s))
|
||||
{
|
||||
while(i < data.length() && (isLetter(s) || s == L"-" || s == L":" || s == L"_"))
|
||||
while(i < data.length() && isNameChar(s))
|
||||
{
|
||||
attr += s;
|
||||
INC;
|
||||
|
|
|
@ -9,18 +9,91 @@
|
|||
#include "CExplorer.h"
|
||||
#endif
|
||||
|
||||
std::string WStringToString(const std::wstring& s)
|
||||
{
|
||||
std::string temp(s.length(), ' ');
|
||||
std::copy(s.begin(), s.end(), temp.begin());
|
||||
return temp;
|
||||
|
||||
/**
 * Decodes a UTF-8 byte string into a wide string, one wchar_t per code point.
 *
 * Malformed input never aborts the conversion. Each of the following yields
 * one U+FFFD (REPLACEMENT CHARACTER) in the output:
 *   - a continuation byte (0x80-0xBF) with no sequence in progress,
 *   - a sequence cut short by an ASCII byte, a new lead byte or end of input,
 *   - a byte in 0xF8-0xFF, which never occurs in UTF-8,
 *   - an overlong encoding, a UTF-16 surrogate (U+D800-U+DFFF), or a value
 *     above U+10FFFF,
 *   - a code point that does not fit in this platform's wchar_t.
 *
 * @param dest Receives the decoded text; its previous content is discarded.
 * @param src  UTF-8 encoded bytes.
 */
void utf8toWStr(std::wstring& dest, const std::string& src)
{
    const wchar_t err = static_cast<wchar_t>(0xFFFD);

    unsigned long cp = 0;    // code point being assembled
    unsigned long minCp = 0; // smallest value the current sequence length may legally encode
    int pending = 0;         // continuation bytes still expected

    dest.clear();
    dest.reserve(src.size());

    for (std::string::size_type i = 0; i < src.size(); ++i)
    {
        const unsigned char c = static_cast<unsigned char>(src[i]);

        if (c >= 0x80 && c <= 0xBF) // continuation byte
        {
            if (pending == 0)
            {
                dest.push_back(err);
                continue;
            }

            cp = (cp << 6) | (c & 0x3Fu);
            if (--pending == 0)
            {
                const wchar_t w = static_cast<wchar_t>(cp);
                const bool valid = cp >= minCp                           // not overlong
                                && cp <= 0x10FFFFul                      // inside Unicode
                                && !(cp >= 0xD800ul && cp <= 0xDFFFul)   // not a surrogate
                                && static_cast<unsigned long>(w) == cp;  // fits in wchar_t
                dest.push_back(valid ? w : err);
            }
            continue;
        }

        // Any other byte starts a new character, so a sequence that is still
        // waiting for continuation bytes was truncated.
        if (pending != 0)
        {
            dest.push_back(err);
            pending = 0;
        }

        if (c <= 0x7F) // ASCII
        {
            dest.push_back(static_cast<wchar_t>(c));
        }
        else if (c <= 0xDF) // lead byte of a 2-byte sequence
        {
            pending = 1;
            cp = c & 0x1Fu;
            minCp = 0x80ul;
        }
        else if (c <= 0xEF) // lead byte of a 3-byte sequence
        {
            pending = 2;
            cp = c & 0x0Fu;
            minCp = 0x800ul;
        }
        else if (c <= 0xF7) // lead byte of a 4-byte sequence
        {
            pending = 3;
            cp = c & 0x07u;
            minCp = 0x10000ul;
        }
        else // 0xF8-0xFF is never valid
        {
            dest.push_back(err);
        }
    }

    if (pending != 0) // input ended in the middle of a sequence
        dest.push_back(err);
}
|
||||
|
||||
std::wstring StringToWString(const std::string& s)
|
||||
/**
 * Encodes a wide string as UTF-8.
 *
 * Each wchar_t is read as a Unicode code point. A UTF-16 surrogate pair
 * (high surrogate immediately followed by a low surrogate) is combined into
 * the supplementary code point it represents, so strings from platforms with
 * a 16-bit wchar_t encode correctly. Values that cannot be encoded -- lone
 * surrogates and anything outside U+0000-U+10FFFF, including negative
 * wchar_t values -- are written as '?'.
 *
 * @param dest Receives the UTF-8 bytes; its previous content is discarded.
 * @param src  Wide string to encode.
 */
void wstrToUtf8(std::string& dest, const std::wstring& src)
{
    dest.clear();
    dest.reserve(src.size());

    for (std::wstring::size_type i = 0; i < src.size(); ++i)
    {
        // Work on an unsigned value so a negative wchar_t cannot slip through
        // the "<= 0x7F" test below.
        unsigned long cp = static_cast<unsigned long>(src[i]);

        // Merge a UTF-16 surrogate pair into a single code point.
        if (cp >= 0xD800ul && cp <= 0xDBFFul && i + 1 < src.size())
        {
            const unsigned long low = static_cast<unsigned long>(src[i + 1]);
            if (low >= 0xDC00ul && low <= 0xDFFFul)
            {
                cp = 0x10000ul + ((cp - 0xD800ul) << 10) + (low - 0xDC00ul);
                ++i;
            }
        }

        if (cp <= 0x7Ful)
        {
            dest.push_back(static_cast<char>(cp));
        }
        else if (cp <= 0x7FFul)
        {
            dest.push_back(static_cast<char>(0xC0 | (cp >> 6)));
            dest.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        }
        else if (cp >= 0xD800ul && cp <= 0xDFFFul)
        {
            // Unpaired surrogate: not a character, must not appear in UTF-8.
            dest.push_back('?');
        }
        else if (cp <= 0xFFFFul)
        {
            dest.push_back(static_cast<char>(0xE0 | (cp >> 12)));
            dest.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            dest.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        }
        else if (cp <= 0x10FFFFul)
        {
            dest.push_back(static_cast<char>(0xF0 | (cp >> 18)));
            dest.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
            dest.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            dest.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        }
        else
        {
            dest.push_back('?');
        }
    }
}
|
||||
|
||||
|
||||
std::string WStringToString(const std::wstring& str)
|
||||
{
|
||||
std::wstring temp(s.length(),L' ');
|
||||
std::copy(s.begin(), s.end(), temp.begin());
|
||||
return temp;
|
||||
string result;
|
||||
wstrToUtf8(result, str);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::wstring StringToWString(const std::string& str)
|
||||
{
|
||||
wstring result;
|
||||
utf8toWStr(result, str);
|
||||
return result;
|
||||
}
|
||||
|
||||
wstring Html$(wstring text)
|
||||
|
@ -69,6 +142,44 @@ ostream &operator<<( ostream &out, std::wstring *str )
|
|||
return out;
|
||||
}
|
||||
|
||||
/* http://www.w3.org/TR/REC-xml/#NT-NameStartChar
|
||||
|
||||
NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
|
||||
[#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
|
||||
[#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
|
||||
*/
|
||||
|
||||
// Character-class test helpers. Each one reads a variable named `car` that
// must be in scope where the macro is expanded.
//
// INTER(min, max)    : car lies in the inclusive numeric range [min, max].
//                      Arguments are parenthesised so that an expression
//                      passed as a bound is evaluated before the comparison.
// INTERCAR(min, max) : same, but the bounds are the first characters of the
//                      given string literals.
// CAR(c)             : car equals the first character of the string literal c.
#define INTER(min, max) (car >= (min) && car <= (max))
#define INTERCAR(min, max) (car >= *(min) && car <= *(max))
#define CAR(c) (car == *(c))
|
||||
|
||||
/**
 * Tells whether the first character of s may start an XML name, following
 * the NameStartChar production of the XML 1.0 recommendation:
 *
 *   ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
 *   [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
 *   [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
 *   [#x10000-#xEFFFF]
 *
 * Only s[0] is examined; s.at(0) throws std::out_of_range when s is empty.
 *
 * @param s String whose first character is tested.
 * @return true when that character is a NameStartChar.
 */
bool isNameStartChar(std::wstring &s)
{
    struct Range { unsigned long first; unsigned long last; };

    // Inclusive code point ranges, in the order the recommendation lists them.
    static const Range ranges[] = {
        { 0x3A,    0x3A    }, // ':'
        { 0x41,    0x5A    }, // 'A'-'Z'
        { 0x5F,    0x5F    }, // '_'
        { 0x61,    0x7A    }, // 'a'-'z'
        { 0xC0,    0xD6    },
        { 0xD8,    0xF6    },
        { 0xF8,    0x2FF   },
        { 0x370,   0x37D   },
        { 0x37F,   0x1FFF  },
        { 0x200C,  0x200D  },
        { 0x2070,  0x218F  },
        { 0x2C00,  0x2FEF  },
        { 0x3001,  0xD7FF  },
        { 0xF900,  0xFDCF  },
        { 0xFDF0,  0xFFFD  },
        { 0x10000, 0xEFFFF }
    };

    // Compared as an unsigned value: a negative wchar_t becomes a very large
    // number and therefore matches no range.
    const unsigned long car = static_cast<unsigned long>(s.at(0));

    for (std::size_t k = 0; k < sizeof(ranges) / sizeof(ranges[0]); ++k)
    {
        if (car >= ranges[k].first && car <= ranges[k].last)
            return true;
    }

    return false;
}
|
||||
|
||||
/* http://www.w3.org/TR/REC-xml/#NT-NameChar
|
||||
|
||||
NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
|
||||
|
||||
*/
|
||||
|
||||
bool isNameChar(wstring &s)
|
||||
{
|
||||
const wchar_t car = (s.at(0));
|
||||
|
||||
return isNameStartChar(s) || CAR("-") || CAR(".") || INTERCAR("0", "9") ||
|
||||
(car == 0xB7) || INTER(0x0300, 0x036F) || INTER(0x203F, 0x2040);
|
||||
}
|
||||
|
||||
#ifndef __HMAIN_CPP
|
||||
|
||||
GB_INTERFACE GB EXPORT;
|
||||
|
|
|
@ -64,6 +64,9 @@ extern "C" GB_INTERFACE GB;
|
|||
|
||||
#define VARGOBJ(type, arg) reinterpret_cast<type*>(VARG(arg))
|
||||
|
||||
// Tests whether the first character of s is an XML NameStartChar (http://www.w3.org/TR/REC-xml/#NT-NameStartChar).
bool isNameStartChar(wstring &s);
|
||||
// Tests whether the first character of s is an XML NameChar (http://www.w3.org/TR/REC-xml/#NT-NameChar).
bool isNameChar(wstring &s);
|
||||
|
||||
|
||||
|
||||
#endif // MAIN_H
|
||||
|
|
Loading…
Reference in a new issue