PhotoRec: distinguish .csv from .txt

This commit is contained in:
Christophe Grenier 2008-10-28 09:22:01 +01:00
parent af6535e0d7
commit 630a6ef2d6

View file

@ -522,7 +522,7 @@ static int header_check_txt(const unsigned char *buffer, const unsigned int buff
{ {
static char *buffer_lower=NULL; static char *buffer_lower=NULL;
static unsigned int buffer_lower_size=0; static unsigned int buffer_lower_size=0;
unsigned int i; unsigned int l=0;
const unsigned char header_asp[22] = "<%@ language=\"vbscript"; const unsigned char header_asp[22] = "<%@ language=\"vbscript";
const unsigned char header_bat[9] = "@echo off"; const unsigned char header_bat[9] = "@echo off";
const unsigned char header_vcf[11] = "begin:vcard"; const unsigned char header_vcf[11] = "begin:vcard";
@ -539,6 +539,7 @@ static int header_check_txt(const unsigned char *buffer, const unsigned int buff
const char sign_html[] = "<html"; const char sign_html[] = "<html";
const unsigned int buffer_size_test=(buffer_size < 2048 ? buffer_size : 2048); const unsigned int buffer_size_test=(buffer_size < 2048 ? buffer_size : 2048);
{ {
unsigned int i;
unsigned int tmp=0; unsigned int tmp=0;
for(i=0;i<10 && isdigit(buffer[i]);i++) for(i=0;i<10 && isdigit(buffer[i]);i++)
tmp=tmp*10+buffer[i]-'0'; tmp=tmp*10+buffer[i]-'0';
@ -568,7 +569,7 @@ static int header_check_txt(const unsigned char *buffer, const unsigned int buff
buffer_lower_size=buffer_size_test+16; buffer_lower_size=buffer_size_test+16;
buffer_lower=(char *)MALLOC(buffer_lower_size); buffer_lower=(char *)MALLOC(buffer_lower_size);
} }
i=UTF2Lat(buffer_lower,buffer,buffer_size_test); l=UTF2Lat(buffer_lower,buffer,buffer_size_test);
/* strncasecmp */ /* strncasecmp */
if(memcmp(buffer_lower,header_bat,sizeof(header_bat))==0) if(memcmp(buffer_lower,header_bat,sizeof(header_bat))==0)
{ {
@ -596,13 +597,13 @@ static int header_check_txt(const unsigned char *buffer, const unsigned int buff
} }
if(buffer[0]=='#' && buffer[1]=='!') if(buffer[0]=='#' && buffer[1]=='!')
{ {
unsigned int l=i-2; unsigned int ll=l-2;
const unsigned char *haystack=buffer_lower+2; const unsigned char *haystack=buffer_lower+2;
const unsigned char *res; const unsigned char *res;
res=memchr(haystack,'\n',l); res=memchr(haystack,'\n',ll);
if(res!=NULL) if(res!=NULL)
l=res-haystack; ll=res-haystack;
if(td_memmem(haystack, l, header_sig_perl, sizeof(header_sig_perl)) != NULL) if(td_memmem(haystack, ll, header_sig_perl, sizeof(header_sig_perl)) != NULL)
{ {
reset_file_recovery(file_recovery_new); reset_file_recovery(file_recovery_new);
file_recovery_new->data_check=&data_check_txt; file_recovery_new->data_check=&data_check_txt;
@ -610,7 +611,7 @@ static int header_check_txt(const unsigned char *buffer, const unsigned int buff
file_recovery_new->extension="pl"; file_recovery_new->extension="pl";
return 1; return 1;
} }
if(td_memmem(haystack, l, header_sig_python, sizeof(header_sig_python)) != NULL) if(td_memmem(haystack, ll, header_sig_python, sizeof(header_sig_python)) != NULL)
{ {
reset_file_recovery(file_recovery_new); reset_file_recovery(file_recovery_new);
file_recovery_new->data_check=&data_check_txt; file_recovery_new->data_check=&data_check_txt;
@ -618,7 +619,7 @@ static int header_check_txt(const unsigned char *buffer, const unsigned int buff
file_recovery_new->extension="py"; file_recovery_new->extension="py";
return 1; return 1;
} }
if(td_memmem(haystack, l, header_sig_ruby, sizeof(header_sig_ruby)) != NULL) if(td_memmem(haystack, ll, header_sig_ruby, sizeof(header_sig_ruby)) != NULL)
{ {
reset_file_recovery(file_recovery_new); reset_file_recovery(file_recovery_new);
file_recovery_new->data_check=&data_check_txt; file_recovery_new->data_check=&data_check_txt;
@ -643,31 +644,59 @@ static int header_check_txt(const unsigned char *buffer, const unsigned int buff
/* ind=~0: random /* ind=~0: random
* ind=~1: constant */ * ind=~1: constant */
double ind=1; double ind=1;
unsigned int nbr=0; unsigned int nbrf=0;
unsigned int is_csv=1;
/* Detect Fortran */ /* Detect Fortran */
{ {
char *str=buffer_lower; char *str=buffer_lower;
while((str=strstr(str, "\n "))!=NULL) while((str=strstr(str, "\n "))!=NULL)
{ {
nbr++; nbrf++;
str++; str++;
} }
} }
if(i>1) /* Detect csv */
{
unsigned int csv_per_line_current=0;
unsigned int csv_per_line=0;
unsigned int line_nbr=0;
unsigned int i;
for(i=0;i<l && is_csv>0;i++)
{
if(buffer_lower[i]==';')
{
csv_per_line_current++;
}
else if(buffer_lower[i]=='\n')
{
if(line_nbr==0)
csv_per_line=csv_per_line_current;
if(csv_per_line_current!=csv_per_line)
is_csv=0;
line_nbr++;
csv_per_line_current=0;
}
}
if(csv_per_line<1 || line_nbr<10)
is_csv=0;
}
if(l>1)
{ {
unsigned int stats[256]; unsigned int stats[256];
unsigned int j; unsigned int i;
memset(&stats, 0, sizeof(stats)); memset(&stats, 0, sizeof(stats));
for(j=0;j<i;j++) for(i=0;i<l;i++)
stats[buffer[j]]++; stats[(unsigned char)buffer_lower[i]]++;
ind=0; ind=0;
for(j=0;j<256;j++) for(i=0;i<256;i++)
if(stats[j]>0) if(stats[i]>0)
ind+=stats[j]*(stats[j]-1); ind+=stats[i]*(stats[i]-1);
ind=ind/i/(i-1); ind=ind/l/(l-1);
} }
if(nbr>10 && i<=0.90) if(nbrf>10 && ind<0.9)
ext="f"; ext="f";
else if(is_csv>0)
ext="csv";
/* Detect LaTeX, C, PHP, JSP, ASP, HTML, C header */ /* Detect LaTeX, C, PHP, JSP, ASP, HTML, C header */
else if(strstr(buffer_lower, sign_tex)!=NULL) else if(strstr(buffer_lower, sign_tex)!=NULL)
ext="tex"; ext="tex";
@ -683,9 +712,9 @@ static int header_check_txt(const unsigned char *buffer, const unsigned int buff
ext="asp"; ext="asp";
else if(strstr(buffer_lower, sign_html)!=NULL) else if(strstr(buffer_lower, sign_html)!=NULL)
ext="html"; ext="html";
else if(strstr(buffer_lower, sign_h)!=NULL && i>50) else if(strstr(buffer_lower, sign_h)!=NULL && l>50)
ext="h"; ext="h";
else if(i<100 || ind<0.03 || ind>0.90) else if(l<100 || ind<0.03 || ind>0.90)
ext=NULL; ext=NULL;
else else
ext=file_hint_txt.extension; ext=file_hint_txt.extension;
@ -712,15 +741,15 @@ Doc: \r (0xD)
if(file_recovery->file_stat->file_hint==&file_hint_doc && if(file_recovery->file_stat->file_hint==&file_hint_doc &&
strstr(file_recovery->filename,".doc")!=NULL) strstr(file_recovery->filename,".doc")!=NULL)
{ {
unsigned int j; unsigned int i;
unsigned int txt_nl=0; unsigned int txt_nl=0;
for(j=0;j<i-1;j++) for(i=0;i<l-1;i++)
if(buffer_lower[j]=='\r' && buffer_lower[j+1]!='\n') if(buffer_lower[i]=='\r' && buffer_lower[i+1]!='\n')
{ {
return 0; return 0;
} }
for(j=0;j<i && j<512;j++) for(i=0;i<l && i<512;i++)
if(buffer[j]=='\n') if(buffer_lower[i]=='\n')
txt_nl=1; txt_nl=1;
if(txt_nl==1) if(txt_nl==1)
{ {