Commit 6bc5f6c0 authored by duanjinfei

fix crawl data

parent ba1ac7ea
......@@ -37,13 +37,30 @@ async function fetchAllData(supabase, table: string, pageSize: number = 1000) {
// 缓存机制:减少重复数据库查询
class DatabaseCache {
// Cache of category records ({ id }) keyed by string.
// NOTE(review): presumably keyed by pretty_url (see getCategoryByPrettyUrl) — the code
// that fills it is not visible here; confirm.
private categoryCache = new Map<string, { id: string }>();
// NOTE(review): `appNameIdCache` is declared twice with opposite key/value types. This looks
// like the before/after pair of a rendered diff — confirm which declaration should remain.
// `init` fills this map from each row's `app_id` and `name`; `checkAppExists` reads it.
private appNameIdCache = new Map<number, string>();
private appNameIdCache = new Map<string, number>();
// Column names accepted by `cleanAppData`; assigned in `init` from the keys of one table row.
private validColumns = [];
// Set of file names built by `initFileData` and queried by `checkIsFileExist`.
private fileNamesSet;
/**
 * Loads the lookup data this cache needs from `table`.
 *
 * Calls `fetchAllData(supabase, table)` and records each returned row's
 * `app_id`/`name` pair in `appNameIdCache`, then queries one row of the same
 * table and stores its property names (minus `id`) in `validColumns`.
 *
 * @param supabase - client object passed to `fetchAllData` and used for the `.from(table)` query.
 * @param table - name of the table to read.
 * @throws rethrows the `error` value returned by the one-row query.
 */
async init(supabase, table: string) {
const appAllData = await fetchAllData(supabase, table)
for (const app of appAllData) {
// NOTE(review): the next two statements store the same pair in opposite directions
// (app_id -> name, then name -> app_id). They look like the before/after lines of a
// rendered diff — confirm which mapping is the intended one.
this.appNameIdCache.set(app.app_id, app.name);
this.appNameIdCache.set(app.name, app.app_id);
}
// Read one row only to discover which columns the table has.
const { data: app, error } = await supabase
.from(table)
.select("*")
.limit(1)
.single();
if (error) throw error;
// Drop `id` so it is not treated as a valid column by cleanAppData.
delete app.id
// Extract the column names.
this.validColumns = this.getFirstLevelKeys(app)
}
/**
 * Builds the set of file names already present in a storage bucket.
 *
 * The listing comes from `getAllFiles`; the resulting names are stored in
 * `fileNamesSet`, which `checkIsFileExist` consults.
 *
 * @param supabase - client object forwarded to `getAllFiles`.
 * @param bucket - name of the storage bucket to list.
 */
async initFileData(supabase, bucket: string) {
  const fileDatas = await this.getAllFiles(supabase, bucket);
  // `getAllFiles` returns `false` when listing throws. Fall back to an empty
  // list so the lookup set is always usable instead of crashing on `.map`.
  const files = Array.isArray(fileDatas) ? fileDatas : [];
  // A Set gives constant-time file-name lookups.
  this.fileNamesSet = new Set(files.map((file) => file.name));
}
async getCategoryByPrettyUrl(supabase, pretty_url: string) {
......@@ -60,7 +77,7 @@ class DatabaseCache {
}
/**
 * Reports whether `appNameIdCache` holds an entry matching the given id/name pair.
 *
 * @param appId - app identifier to check.
 * @param appName - app name to check.
 * @returns `true` when the cached value for `appId` is strictly equal to `appName`.
 */
async checkAppExists(appId: number, appName: string) {
return this.appNameIdCache.get(appId) === appName; // read the cached value from this.appNameIdCache
// NOTE(review): unreachable — the statement above has already returned. This looks like the
// replacement line of a rendered diff (look up by name, compare with id); confirm which
// of the two returns should remain.
return this.appNameIdCache.get(appName) === appId; // read the cached value from this.appNameIdCache
}
async batchInsertApps(supabase, apps: any[]) {
......@@ -70,37 +87,142 @@ class DatabaseCache {
if (error) throw error;
}
}
/**
 * Returns a copy of `app` reduced to the properties listed in `validColumns`,
 * with `app_id` set from `app.ID`.
 *
 * `app_id` is always present in the result and always comes from `app.ID`,
 * regardless of the order of the keys in `app` or whether any key matched.
 *
 * @param app - raw app record to clean.
 * @returns the cleaned object, or `null` if cleaning throws (for example when
 *          `app` is `null` or `undefined`).
 */
cleanAppData(app) {
  try {
    // Build the lookup once instead of scanning the column list for every key.
    const allowedColumns = new Set(this.validColumns);
    // Seed `app_id` first so it stays the first property of the result.
    const cleaned: Record<string, unknown> = { app_id: app.ID };
    for (const key of Object.keys(app)) {
      if (allowedColumns.has(key)) {
        cleaned[key] = app[key];
      }
    }
    // Re-assign after copying so a copied `app_id` column can never override app.ID.
    cleaned.app_id = app.ID;
    return cleaned;
  } catch (error) {
    console.error('Error cleaning app data:', error);
    return null;
  }
}
/**
 * Lists the own enumerable top-level property names of `obj`.
 *
 * @param obj - object whose keys are read; nested objects are not descended into.
 * @returns the result of `Object.keys(obj)`.
 */
function getFirstLevelKeys(obj) {
  const topLevelKeys = Object.keys(obj);
  return topLevelKeys;
}
/**
 * Lists the own enumerable top-level property names of `obj`.
 *
 * @param obj - object whose keys are read; nested objects are not descended into.
 * @returns the result of `Object.keys(obj)`.
 */
getFirstLevelKeys(obj) {
  const topLevelKeys = Object.keys(obj);
  return topLevelKeys;
}
// NOTE(review): this span interleaves the lines of two different definitions — a standalone
// `cleanAppData(app, supabase, tableName)` function and a `getAllFiles(supabase, bucket)`
// method — the way a rendered diff shows removed and added lines together. As written it
// is not one runnable definition; confirm against the real file before relying on it.
//
// cleanAppData lines: query one row of `tableName`, drop `id`, take the row's keys as the
// valid columns, and build an object from the matching keys with `app_id` set from `app.ID`;
// on any exception log it and return null.
//
// getAllFiles lines: list the root ("") of `bucket` in pages of `limit` entries, appending
// each page to `allFiles`, until a page is shorter than `limit` or the call reports an
// error; returns the accumulated list, or `false` if an exception is thrown.
async function cleanAppData(app, supabase, tableName) {
try {
// Fetch the column information with a single query.
const { data: app, error } = await supabase
.from(tableName)
.select("*")
.limit(1)
.single();
async getAllFiles(supabase, bucket) {
let allFiles = [];
let offset = 0;
const limit = 100; // fetch at most 100 files per request
if (error) throw error;
delete app.id
// Extract the column names.
const validColumns = getFirstLevelKeys(app)
// Build the cleaned object from the matching keys.
return Object.keys(app)
.filter(key => validColumns.includes(key))
.reduce((obj, key) => {
obj.app_id = app.ID;
obj[key] = app[key];
return obj;
}, {});
} catch (error) {
console.error('Error cleaning app data:', error);
return null;
try {
while (true) {
// Fetch the file list one page at a time.
const { data, error } = await supabase.storage.from(bucket).list("", {
limit,
offset,
});
if (error) {
// A listing error stops paging; whatever was collected so far is still returned.
console.error("获取文件列表失败:", error.message);
break;
}
// Append the current page to the accumulated list.
allFiles = allFiles.concat(data);
// Fewer entries than `limit` means this was the last page.
if (data.length < limit) {
break;
}
// Otherwise move on to the next page.
offset += limit;
}
return allFiles;
} catch (err) {
console.error("发生错误:", err.message);
// Note: the failure value is `false`, not an empty array.
return false;
}
}
/**
 * Downloads an app's icon from ton.app and uploads it to storage unless a file
 * with the same name is already known.
 *
 * Does nothing when the element has no `icon.url`. The existence check runs
 * before the download so already-stored icons cost no network request.
 *
 * @param supabase - client object forwarded to `uploadFileToStorage`.
 * @param element - app record; `element.icon.url` is the icon path on ton.app.
 */
async uploadIcon(supabase, element) {
  // Guard the whole chain: records may lack an `icon` object entirely.
  const rawUrl = element && element.icon ? element.icon.url : undefined;
  if (!rawUrl) {
    return;
  }
  const fileName = rawUrl.split("/").pop();
  // Already stored: skip the download as well as the upload.
  if (await this.checkIsFileExist(fileName)) {
    return;
  }
  // Strip leading slashes so the joined URL has no "//" after the host.
  const iconUrl = rawUrl.replace(/^\/+/, "");
  const imageUrl = `https://ton.app/${iconUrl}`;
  const response = await fetch(imageUrl);
  if (!response.ok) {
    console.error("Failed to fetch image:", response.statusText, imageUrl);
    return;
  }
  const blob = await response.blob(); // response body as a Blob
  const file = new File([blob], fileName, { type: blob.type }); // wrap the Blob as a File
  await this.uploadFileToStorage(supabase, file, fileName);
}
/**
 * Downloads every image of an app from ton.app and uploads each one to storage
 * unless a file with the same name is already known.
 *
 * Entries without a `url` are skipped. A failed download is logged and the
 * loop continues with the next image.
 *
 * @param supabase - client object forwarded to `uploadFileToStorage`.
 * @param element - app record; `element.images` is a list of `{ url }` entries.
 */
async uploadImages(supabase, element) {
  if (!element || !element.images || element.images.length === 0) {
    return;
  }
  for (const image of element.images) {
    if (!image || !image.url) {
      continue;
    }
    const fileName = image.url.split("/").pop();
    // Already stored: skip the download as well as the upload.
    if (await this.checkIsFileExist(fileName)) {
      continue;
    }
    // Strip leading slashes, as uploadIcon does, so the URL has no "//" after the host.
    const imageUrl = `https://ton.app/${image.url.replace(/^\/+/, "")}`;
    const response = await fetch(imageUrl);
    if (!response.ok) {
      console.error("Failed to fetch image:", response.statusText, imageUrl);
      continue;
    }
    const blob = await response.blob(); // response body as a Blob
    const file = new File([blob], fileName, { type: blob.type }); // wrap the Blob as a File
    // The client must be passed first: the signature is (supabase, file, fileName).
    await this.uploadFileToStorage(supabase, file, fileName);
  }
}
/**
 * Uploads a file to the "media" storage bucket under `fileName`.
 *
 * The upload is made with `upsert: false`. Failures are logged, not thrown.
 *
 * @param supabase - client object whose `storage` API performs the upload.
 * @param file - the file to upload.
 * @param fileName - object path inside the bucket.
 * @returns the public URL built from the upload's `fullPath`, or `undefined`
 *          when the upload reports an error or throws.
 */
async uploadFileToStorage(supabase, file, fileName) {
  try {
    const refix =
      "https://jokqrcagutpmvpilhcfq.supabase.co/storage/v1/object/public";
    const { data, error } = await supabase.storage
      .from("media")
      .upload(`${fileName}`, file, {
        cacheControl: "120",
        // Prefer the file's own MIME type so non-PNG images are not mislabeled;
        // keep "image/png" as the fallback when the type is unknown.
        contentType: (file && file.type) || "image/png",
        upsert: false,
      });
    console.log("data:", data);
    if (data != null) {
      const url = `${refix}/${data.fullPath}`;
      console.log("upload file success:", fileName);
      return url;
    }
    if (error != null) {
      // Include the reason so failed uploads can be diagnosed from the log.
      console.log("upload file error:", fileName, error.message);
    }
  } catch (error) {
    console.log("error:", error);
  }
  return undefined;
}
/**
 * Looks up `fileName` in `fileNamesSet` and logs the outcome.
 *
 * @param fileName - file name to look for.
 * @returns whether the set contains `fileName`.
 */
async checkIsFileExist(fileName) {
  // Membership test against the set of known file names.
  const found = this.fileNamesSet.has(fileName);
  console.log(
    found
      ? `文件 "${fileName} ${found}" 已存在`
      : `文件 "${fileName} ${found}" 不存在`
  );
  return found;
}
}
......@@ -141,7 +263,7 @@ async function getTonAppInfo(supabase) {
// 处理应用
for (let app of apps) {
// 清理应用数据
const cleanedApp = await cleanAppData(app, supabase, 'app');
const cleanedApp = databaseCache.cleanAppData(app);
if (!cleanedApp) continue;
// 检查应用是否已存在
......@@ -167,7 +289,12 @@ async function getTonAppInfo(supabase) {
// 批量插入应用
console.log("await app to insert length", appsToInsert.length);
if (appsToInsert.length > 0) {
await databaseache.batchInsertApps(supabase, appsToInsert);
await databaseCache.batchInsertApps(supabase, appsToInsert);
await databaseCache.initFileData(supabase, "media");
for (const element of appsToInsert) {
await databaseCache.uploadIcon(supabase, element);
await databaseCache.uploadImages(supabase, element);
}
}
return appsToInsert.length;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment