在Nand Flash上建立UBIFS格式的檔案系統,用於存儲系統的歷史資料。
在實際調試過程中,發現UBIFS檔案系統會產生錯誤,輸出資訊如下:
[685108.022234] nand_erase_nand: attempt to erase a bad block at page 0x00008580
[685108.050087] nand_erase_nand: attempt to erase a bad block at page 0x00008580
[685108.070232] nand_erase_nand: attempt to erase a bad block at page 0x00008580
[685108.090388] nand_erase_nand: attempt to erase a bad block at page 0x00008580
[685108.097944] UBI error: do_sync_erase: cannot erase PEB 534, error -5
[685108.104750] UBI error: erase_worker: failed to erase PEB 534, error -5
[685108.111725] UBI: mark PEB 534 as bad
[685108.115647] UBI: 17 PEBs left in the reserve
[687322.014321] UBI: scrubbed PEB 1272 (LEB 0:1453), data moved to PEB 1855
[687774.705119] UBI error: ubi_io_read: error -74 (ECC error) while reading 126976 bytes from PEB 456:4096, read 126976 bytes
[687774.733301] UBIFS error (pid 10861): ubifs_check_node: bad CRC: calculated 0x7845a4da, read 0x907d6ad0
[687774.743302] UBIFS error (pid 10861): ubifs_check_node: bad node at LEB 1769:113376
[687774.751381] magic 0x6101831
[687774.751389] crc 0x907d6ad0
[687774.751398] node_type 1 (data node)
[687774.751407] group_type 0 (no node group)
[687774.751417] sqnum 64559590
[687774.751424] len 1667
[687774.751438] key (24037, data, 1463)
[687774.751448] size 4096
[687774.751455] compr_typ 1
[687774.751463] data size 1619
[687774.751469] data:
[687774.751486] 00000000: 0a 61 61 39 0a 30 30 31 65 37 61 61 63 0a c1 01 66 c2 01 62 32 e1 01 35 e1 01 38 e1 01 62 e1 01
[687774.942198] UBIFS error (pid 10861): ubifs_scan: bad node
[687774.947989] UBIFS error (pid 10861): ubifs_scanned_corruption: corruption at LEB 1769:113376
[687774.956997] UBIFS error (pid 10861): ubifs_scanned_corruption: first 8192 bytes from LEB 1769:113376
[687774.966778] 00000000: 06101831 907d6ad0 03d919e6 00000000 00000683 00000001 00005de5 200005b7 1....j}..................].....
[687774.973358] UBIFS error (pid 10861): ubifs_scan: LEB 1769 scanning failed
[687774.980645] UBIFS warning (pid 10861): ubifs_ro_mode: switched to read-only mode, error -117
[687775.162137] UBIFS error (pid 10861): ubifs_budget_space: cannot budget space, error -117
[687804.722203] UBIFS error (pid 11722): make_reservation: cannot reserve 160 bytes in jhead 1, error -30
[687804.732084] UBIFS error (pid 11722): ubifs_write_inode: can't write inode 74, error -30
分析認為是Nand Flash資料寫入錯誤,因此編寫測試程式,對nand flash進行多任務的資料讀寫操作,代碼如下:
nand_test.c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
/* Number of files this task cycles through, and the index of its first file. */
enum { FILE_COUNT = 40, FILE_BASE = 0 };
/* Every file holds RECORD_COUNT records of RECORD_LEN bytes ("%08x\n"). */
enum { RECORD_COUNT = 1024 * 1024, RECORD_LEN = 9 };

/*
 * Create `path` and fill it with RECORD_COUNT records; record j is the
 * 8-digit hex value of k * j followed by a newline.
 *
 * Returns -1 if the file could not be opened, 0 if it was opened but a
 * write/flush/sync/close step failed, and 1 on success.
 */
static int write_test_file(const char *path, int k)
{
    char record[16];
    int ok = 1;
    int j;
    FILE *fp = fopen(path, "wb");

    if (!fp) {
        printf("Cannot open %s\n", path);
        return -1;
    }

    for (j = 0; j < RECORD_COUNT; j++) {
        snprintf(record, sizeof record, "%08x\n", (unsigned int)(k * j));
        if (fwrite(record, RECORD_LEN, 1, fp) != 1) {
            printf("Write value to file:%s failed \n", path);
            ok = 0;
            break;
        }
    }

    /* A failed flush or sync is a write error too, so it must be reported. */
    if (fflush(fp) != 0 || fsync(fileno(fp)) != 0) {
        printf("Sync file:%s failed \n", path);
        ok = 0;
    }
    if (fclose(fp) != 0) {
        printf("Close file:%s failed \n", path);
        ok = 0;
    }
    return ok;
}

/*
 * Read `path` back and compare every record against the pattern that
 * write_test_file() produces for the same k.
 *
 * Returns -1 if the file could not be opened, 0 if a read failed or any
 * record mismatched, and 1 if the whole file matched.
 */
static int verify_test_file(const char *path, int k)
{
    char record[16];
    char expected[16];
    int ok = 1;
    int j;
    FILE *fp = fopen(path, "r+");

    if (!fp) {
        printf("Cannot open %s\n", path);
        return -1;
    }

    for (j = 0; j < RECORD_COUNT; j++) {
        if (fread(record, RECORD_LEN, 1, fp) != 1) {
            printf("read file:%s failed \n", path);
            ok = 0;
            break;
        }
        snprintf(expected, sizeof expected, "%08x\n", (unsigned int)(k * j));
        if (memcmp(record, expected, RECORD_LEN) != 0) {
            /* Keep scanning so every corrupted record is listed. */
            printf("check file:%s line %d failed \n", path, j);
            ok = 0;
        }
    }

    fclose(fp);
    return ok;
}

/*
 * NAND stress test: repeats forever a cycle of writing FILE_COUNT files
 * under /data, reading them back for verification, and deleting them.
 * The result line printed after each cycle reports success only when no
 * step of that cycle failed.
 */
int main(void)
{
    char path[64];
    int k;

    while (1) {
        int created = 0; /* files that were opened for writing and need cleanup */
        int failed = 0;

        for (k = 0; k < FILE_COUNT; k++) {
            int rc;

            snprintf(path, sizeof path, "/data/test%d.dat", FILE_BASE + k);
            rc = write_test_file(path, k);
            if (rc < 0) {
                failed = 1;
                break;
            }
            created = k + 1;
            if (rc == 0)
                failed = 1;
        }

        for (k = 0; k < created; k++) {
            int rc;

            snprintf(path, sizeof path, "/data/test%d.dat", FILE_BASE + k);
            rc = verify_test_file(path, k);
            if (rc < 0) {
                failed = 1;
                break;
            }
            if (rc == 0)
                failed = 1;
        }

        /* Remove every file created in this cycle, even if verification stopped early. */
        for (k = created - 1; k >= 0; k--) {
            snprintf(path, sizeof path, "/data/test%d.dat", FILE_BASE + k);
            if (remove(path) != 0) {
                printf("Cannot remove %s\n", path);
                failed = 1;
            }
        }

        if (failed)
            printf("nand test failed!~ \n");
        else
            printf("nand test success!~ \n");
    }
    return 0;
}
nand_test1.c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
/* Number of files this task cycles through, and the index of its first file. */
enum { FILE_COUNT = 10, FILE_BASE = 40 };
/* Every file holds RECORD_COUNT records of RECORD_LEN bytes ("%08x\n"). */
enum { RECORD_COUNT = 1024 * 1024, RECORD_LEN = 9 };

/*
 * Create `path` and fill it with RECORD_COUNT records; record j is the
 * 8-digit hex value of k * j followed by a newline.
 *
 * Returns -1 if the file could not be opened, 0 if it was opened but a
 * write/flush/sync/close step failed, and 1 on success.
 */
static int write_test_file(const char *path, int k)
{
    char record[16];
    int ok = 1;
    int j;
    FILE *fp = fopen(path, "wb");

    if (!fp) {
        printf("Cannot open %s\n", path);
        return -1;
    }

    for (j = 0; j < RECORD_COUNT; j++) {
        snprintf(record, sizeof record, "%08x\n", (unsigned int)(k * j));
        if (fwrite(record, RECORD_LEN, 1, fp) != 1) {
            printf("Write value to file:%s failed \n", path);
            ok = 0;
            break;
        }
    }

    /* A failed flush or sync is a write error too, so it must be reported. */
    if (fflush(fp) != 0 || fsync(fileno(fp)) != 0) {
        printf("Sync file:%s failed \n", path);
        ok = 0;
    }
    if (fclose(fp) != 0) {
        printf("Close file:%s failed \n", path);
        ok = 0;
    }
    return ok;
}

/*
 * Read `path` back and compare every record against the pattern that
 * write_test_file() produces for the same k.
 *
 * Returns -1 if the file could not be opened, 0 if a read failed or any
 * record mismatched, and 1 if the whole file matched.
 */
static int verify_test_file(const char *path, int k)
{
    char record[16];
    char expected[16];
    int ok = 1;
    int j;
    FILE *fp = fopen(path, "r+");

    if (!fp) {
        printf("Cannot open %s\n", path);
        return -1;
    }

    for (j = 0; j < RECORD_COUNT; j++) {
        if (fread(record, RECORD_LEN, 1, fp) != 1) {
            printf("read file:%s failed \n", path);
            ok = 0;
            break;
        }
        snprintf(expected, sizeof expected, "%08x\n", (unsigned int)(k * j));
        if (memcmp(record, expected, RECORD_LEN) != 0) {
            /* Keep scanning so every corrupted record is listed. */
            printf("check file:%s line %d failed \n", path, j);
            ok = 0;
        }
    }

    fclose(fp);
    return ok;
}

/*
 * NAND stress test (second task): repeats forever a cycle of writing
 * FILE_COUNT files under /data (indices starting at FILE_BASE), reading
 * them back for verification, and deleting them. The result line printed
 * after each cycle reports success only when no step of that cycle failed.
 */
int main(void)
{
    char path[64];
    int k;

    while (1) {
        int created = 0; /* files that were opened for writing and need cleanup */
        int failed = 0;

        for (k = 0; k < FILE_COUNT; k++) {
            int rc;

            snprintf(path, sizeof path, "/data/test%d.dat", FILE_BASE + k);
            rc = write_test_file(path, k);
            if (rc < 0) {
                failed = 1;
                break;
            }
            created = k + 1;
            if (rc == 0)
                failed = 1;
        }

        for (k = 0; k < created; k++) {
            int rc;

            snprintf(path, sizeof path, "/data/test%d.dat", FILE_BASE + k);
            rc = verify_test_file(path, k);
            if (rc < 0) {
                failed = 1;
                break;
            }
            if (rc == 0)
                failed = 1;
        }

        /* Remove every file created in this cycle, even if verification stopped early. */
        for (k = created - 1; k >= 0; k--) {
            snprintf(path, sizeof path, "/data/test%d.dat", FILE_BASE + k);
            if (remove(path) != 0) {
                printf("Cannot remove %s\n", path);
                failed = 1;
            }
        }

        if (failed)
            printf("nand test 1 failed!~ \n");
        else
            printf("nand test 1 success!~ \n");
    }
    return 0;
}
測試發現,在多任務操作Nand Flash的過程中,會有比較大的機率導致nand flash錯誤。
在Linux Kernel的menuconfig中,配置選項 Device Drivers -> Memory Technology Device (MTD) support -> NAND Device Support -> Verify NAND page writes 用來配置在nand flash寫入時,是否進行額外的校驗。
開啟Verify NAND page writes後,執行測試程式,輸出資訊如下:
[270907.433837] UBI error: ubi_io_write: error -5 while writing 2048 bytes to PEB 976:106496, written 0 bytes
[270907.444130] UBI warning: ubi_eba_write_leb: failed to write data to PEB 976
[270907.451584] UBI: recover PEB 976, move data to PEB 1456
[270907.650083] UBI: data was successfully recovered
[270907.656046] UBI: run torture test for PEB 976
[270908.300205] UBI: PEB 976 passed torture test, do not mark it as bad
[271145.827524] UBI error: ubi_io_write: error -5 while writing 2048 bytes to PEB 1048:57344, written 0 bytes
[271145.837783] UBI warning: ubi_eba_write_leb: failed to write data to PEB 1048
[271145.845346] UBI: recover PEB 1048, move data to PEB 1457
[271145.963098] UBI: data was successfully recovered
[271145.968262] UBI: run torture test for PEB 1048
[271146.740653] UBI: PEB 1048 passed torture test, do not mark it as bad
分析顯示,開啟Verify NAND page writes後,能有效減少nand flash出錯導致的程式異常。
在長期穩定性的測試中發現,nand flash總是有可能會發生讀寫錯誤,導致分區變為只讀分區。由於在實際應用中,nand flash分區存儲的為歷史資料資訊,所以希望在nand flash讀寫錯誤後不修改為只讀狀態。參照 https://e2e.ti.com/support/embedded/linux/f/354/t/171839 的內容,修改如下:
===================================================================
--- fs/ubifs/scan.c (revision 1897)
+++ fs/ubifs/scan.c (working copy)
@@ -339,7 +339,7 @@
if (!quiet)
ubifs_err("corrupt empty space at LEB %d:%d",
lnum, offs);
- goto corrupted;
+ //goto corrupted;
}
return sleb;
關於nand flash在am335x系列CPU上使用出現的這個問題,TI官方也沒有太好的解決方案,具體可以參照TI的官方論壇答覆。
https://e2e.ti.com/support/embedded/linux/f/354/t/171839