Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: document truncation and loss in notion document sync #5631

Merged
merged 1 commit into from
Jul 5, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
fix: notion extractor only retrieves the first page of many blocks, and the subsequent blocks are lost.
  • Loading branch information
Aurelius Huang committed Jun 26, 2024
commit 45cc5c5281a9972c68afc0d5325914e3e6f0bff6
31 changes: 15 additions & 16 deletions api/core/rag/extractor/notion_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,10 @@ def _get_notion_database_data(

def _get_notion_block_data(self, page_id: str) -> list[str]:
result_lines_arr = []
cur_block_id = page_id
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=page_id)
while True:
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
query_dict: dict[str, Any] = {}

query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}
res = requests.request(
"GET",
block_url,
Expand All @@ -151,7 +150,7 @@ def _get_notion_block_data(self, page_id: str) -> list[str]:
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
json=query_dict
params=query_dict
)
data = res.json()
# current block's heading
Expand Down Expand Up @@ -194,16 +193,16 @@ def _get_notion_block_data(self, page_id: str) -> list[str]:
if data["next_cursor"] is None:
break
else:
cur_block_id = data["next_cursor"]
start_cursor = data["next_cursor"]
return result_lines_arr

def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
"""Read a block."""
result_lines_arr = []
cur_block_id = block_id
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
while True:
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
query_dict: dict[str, Any] = {}
query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}

res = requests.request(
"GET",
Expand All @@ -213,7 +212,7 @@ def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
json=query_dict
params=query_dict
)
data = res.json()
if 'results' not in data or data["results"] is None:
Expand Down Expand Up @@ -255,7 +254,7 @@ def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
if data["next_cursor"] is None:
break
else:
cur_block_id = data["next_cursor"]
start_cursor = data["next_cursor"]

result_lines = "\n".join(result_lines_arr)
return result_lines
Expand All @@ -264,10 +263,10 @@ def _read_table_rows(self, block_id: str) -> str:
"""Read table rows."""
done = False
result_lines_arr = []
cur_block_id = block_id
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
while not done:
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
query_dict: dict[str, Any] = {}
query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}

res = requests.request(
"GET",
Expand All @@ -277,7 +276,7 @@ def _read_table_rows(self, block_id: str) -> str:
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
json=query_dict
params=query_dict
)
data = res.json()
# get table headers text
Expand Down Expand Up @@ -306,7 +305,7 @@ def _read_table_rows(self, block_id: str) -> str:
done = True
break
else:
cur_block_id = data["next_cursor"]
start_cursor = data["next_cursor"]

result_lines = "\n".join(result_lines_arr)
return result_lines
Expand Down