* BUG: the parser now correctly supports UTF-8
* BUG: the parser now allows all special characters in tag names and attribute names, including the UTF-8 ones, as defined in http://www.w3.org/TR/REC-xml/#NT-NameChar

git-svn-id: svn://localhost/gambas/trunk@4654 867c0c6c-44f3-4631-809d-bfa615b0a4ec
This commit is contained in:
Adrien Prokopowicz 2012-04-22 02:51:53 +00:00
parent bda1b7d29d
commit 2cbff79d05
3 changed files with 126 additions and 12 deletions

View file

@ -280,7 +280,7 @@ vector<Node*>* Element::fromText(wstring data, wstring::size_type i, uint c, uin
for(INC; i < data.length(); INC) //On cherche le tagName
{
if(!isLetter(s) && s != L":" && s != L"-"&& s != L"_") break;
if(!isNameChar(s)) break;
tag += s;
}
@ -357,9 +357,9 @@ vector<Node*>* Element::fromText(wstring data, wstring::size_type i, uint c, uin
wstring attr, sVal = L"";
if(isLetter(s))
if(isNameStartChar(s))
{
while(i < data.length() && (isLetter(s) || s == L"-" || s == L":" || s == L"_"))
while(i < data.length() && isNameChar(s))
{
attr += s;
INC;

View file

@ -9,18 +9,91 @@
#include "CExplorer.h"
#endif
std::string WStringToString(const std::wstring& s)
{
std::string temp(s.length(), ' ');
std::copy(s.begin(), s.end(), temp.begin());
return temp;
/*
 * Decodes the UTF-8 byte string `src` into `dest`, storing one wchar_t per decoded value.
 * `dest` is cleared first. Every malformed unit is replaced by U+FFFD (REPLACEMENT CHARACTER):
 *   - a continuation byte (0x80-0xBF) with no sequence in progress,
 *   - a byte in 0xF8-0xFF, which never starts a sequence here,
 *   - a multi-byte sequence cut short by a non-continuation byte or by the end of input.
 * Overlong encodings are not rejected. A 4-byte sequence yields a 21-bit value, so it is only
 * representable when wchar_t is wide enough to hold it.
 */
void utf8toWStr(std::wstring& dest, const std::string& src){
    const wchar_t err = static_cast<wchar_t>(0xFFFD); // U+FFFD REPLACEMENT CHARACTER
    unsigned long cp = 0; // value being assembled from the current sequence
    int pending = 0;      // continuation bytes still expected for that sequence

    dest.clear();
    for (std::string::size_type i = 0; i < src.size(); i++){
        const unsigned char c = static_cast<unsigned char>(src[i]);
        const bool continuation = (c & 0xc0) == 0x80; // 10xxxxxx

        // Anything other than a continuation byte ends an unfinished sequence: flag it
        // instead of dropping it silently, then handle the current byte normally.
        if (pending && !continuation){
            dest.push_back(err);
            pending = 0;
        }

        if (c <= 0x7f){ // single byte (ASCII)
            dest.push_back(static_cast<wchar_t>(c));
        }
        else if (continuation){ // second/third/fourth byte of a sequence
            if (pending){
                cp = (cp << 6) | (c & 0x3f);
                if (--pending == 0)
                    dest.push_back(static_cast<wchar_t>(cp));
            }
            else
                dest.push_back(err); // stray continuation byte
        }
        else if (c <= 0xdf){ // 2-byte sequence start
            pending = 1;
            cp = c & 0x1f;
        }
        else if (c <= 0xef){ // 3-byte sequence start
            pending = 2;
            cp = c & 0x0f;
        }
        else if (c <= 0xf7){ // 4-byte sequence start
            pending = 3;
            cp = c & 0x07;
        }
        else{ // 0xF8-0xFF: not a valid lead byte
            dest.push_back(err);
        }
    }
    if (pending) // input ended in the middle of a sequence
        dest.push_back(err);
}
std::wstring StringToWString(const std::string& s)
/*
 * Encodes the wide string `src` as UTF-8 into `dest`. `dest` is cleared first.
 * Each element of `src` is treated as one value and written as 1 to 4 bytes depending on its
 * magnitude; a value above 0x10FFFF is written as '?'.
 * Elements are converted to an unsigned type before the range tests, so that on platforms where
 * wchar_t is signed a negative element ends up in the '?' branch instead of being emitted as a
 * raw (invalid) byte.
 * Values in 0xD800-0xDFFF get no special treatment: they are written as three bytes like any
 * other value up to 0xFFFF.
 */
void wstrToUtf8(std::string& dest, const std::wstring& src){
    dest.clear();
    dest.reserve(src.size()); // lower bound: at least one byte per element

    for (std::wstring::size_type i = 0; i < src.size(); i++){
        const unsigned long cp = static_cast<unsigned long>(src[i]);

        if (cp <= 0x7f){
            dest.push_back(static_cast<char>(cp));
        }
        else if (cp <= 0x7ff){
            dest.push_back(static_cast<char>(0xc0 | ((cp >> 6) & 0x1f)));
            dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        }
        else if (cp <= 0xffff){
            dest.push_back(static_cast<char>(0xe0 | ((cp >> 12) & 0x0f)));
            dest.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
            dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        }
        else if (cp <= 0x10ffff){
            dest.push_back(static_cast<char>(0xf0 | ((cp >> 18) & 0x07)));
            dest.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3f)));
            dest.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3f)));
            dest.push_back(static_cast<char>(0x80 | (cp & 0x3f)));
        }
        else{
            dest.push_back('?'); // not encodable
        }
    }
}
// Converts the wide string `str` to a narrow string by passing it to wstrToUtf8() and
// returning the buffer that call fills.
std::string WStringToString(const std::wstring& str)
{
// NOTE(review): the next three statements use `s` and `temp`, which match the previous
// copy-based implementation rather than the current parameter name `str` — presumably they are
// the removed side of this diff and not live code; confirm against the committed file.
std::wstring temp(s.length(),L' ');
std::copy(s.begin(), s.end(), temp.begin());
return temp;
string result;
wstrToUtf8(result, str);
return result;
}
// Converts the narrow string `str` to a wide string: the conversion itself is done by
// utf8toWStr(), this wrapper only provides the return-by-value form.
std::wstring StringToWString(const std::string& str)
{
    std::wstring decoded;
    utf8toWStr(decoded, str);
    return decoded;
}
wstring Html$(wstring text)
@ -69,6 +142,44 @@ ostream &operator<<( ostream &out, std::wstring *str )
return out;
}
/* http://www.w3.org/TR/REC-xml/#NT-NameStartChar
NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
[#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
[#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
*/
// Helper macros for the character-class tests that follow. Each one expands to an expression
// that reads a variable named `car`, so `car` must be declared where the macro is used.
// INTER: true when car lies in the inclusive range [min, max].
#define INTER(min, max) (car >= min && car <= max)
// INTERCAR: same test, but min and max are dereferenced first (they are passed as string
// literals in this file), so only the first element of each literal is compared.
#define INTERCAR(min, max) (car >= *(min) && car <= *(max))
// CAR: true when car equals the first element of c (also passed as a string literal).
#define CAR(c) (car == *(c))
/*
 * Tells whether the first character of `s` is allowed as the first character of an XML name
 * (W3C XML 1.0 production NameStartChar). Only s[0] is examined.
 * Returns false for an empty string.
 * The character is converted to an unsigned value before the range tests, so a negative
 * wchar_t (on platforms where wchar_t is signed) never matches.
 */
bool isNameStartChar(std::wstring &s)
{
    struct Range { unsigned long first, last; };
    // Inclusive ranges, in the order of the NameStartChar production.
    static const Range ranges[] = {
        {0x3A, 0x3A},       // ":"
        {0x41, 0x5A},       // [A-Z]
        {0x5F, 0x5F},       // "_"
        {0x61, 0x7A},       // [a-z]
        {0xC0, 0xD6},
        {0xD8, 0xF6},       // starts at U+00D8; U+00D7 (multiplication sign) is excluded
        {0xF8, 0x2FF},
        {0x370, 0x37D},
        {0x37F, 0x1FFF},
        {0x200C, 0x200D},
        {0x2070, 0x218F},
        {0x2C00, 0x2FEF},
        {0x3001, 0xD7FF},
        {0xF900, 0xFDCF},
        {0xFDF0, 0xFFFD},
        {0x10000, 0xEFFFF}  // only reachable when wchar_t is wider than 16 bits
    };

    if (s.empty())
        return false;

    const unsigned long cp = static_cast<unsigned long>(s[0]);
    const Range *const end = ranges + sizeof(ranges) / sizeof(ranges[0]);
    for (const Range *r = ranges; r != end; ++r)
    {
        if (cp >= r->first && cp <= r->last)
            return true;
    }
    return false;
}
/* http://www.w3.org/TR/REC-xml/#NT-NameChar
NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
*/
// Tells whether the first character of `s` may appear inside an XML name: either it is
// accepted by isNameStartChar(), or it is one of the extra characters NameChar adds.
bool isNameChar(wstring &s)
{
    // at() is evaluated before anything else and throws std::out_of_range when s is empty.
    const wchar_t ch = s.at(0);

    if (isNameStartChar(s))
        return true;

    // "-", "." and the middle dot #xB7
    if (ch == L'-' || ch == L'.' || ch == 0xB7)
        return true;

    // [0-9]
    if (ch >= L'0' && ch <= L'9')
        return true;

    // [#x0300-#x036F]
    if (ch >= 0x0300 && ch <= 0x036F)
        return true;

    // [#x203F-#x2040]
    return ch >= 0x203F && ch <= 0x2040;
}
#ifndef __HMAIN_CPP
GB_INTERFACE GB EXPORT;

View file

@ -64,6 +64,9 @@ extern "C" GB_INTERFACE GB;
#define VARGOBJ(type, arg) reinterpret_cast<type*>(VARG(arg))
// True when the first character of s is accepted as the first character of an XML name
// (intended to follow the W3C XML 1.0 NameStartChar production).
bool isNameStartChar(wstring &s);
// True when the first character of s is accepted inside an XML name
// (intended to follow the W3C XML 1.0 NameChar production).
bool isNameChar(wstring &s);
#endif // MAIN_H