Skip to content

Commit

Permalink
fix: document truncation and loss in notion document sync (#5631)
Browse files (browse the repository at this point in the history)
Co-authored-by: Aurelius Huang <cm.huang@aftership.com>
  • Loading branch information
Aurelius-Huang and Aurelius Huang committed Jul 5, 2024
1 parent f8aaa57 commit f546db5
Showing 1 changed file with 15 additions and 16 deletions.
31 changes: 15 additions & 16 deletions api/core/rag/extractor/notion_extractor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -140,11 +140,10 @@ def _get_notion_database_data(

def _get_notion_block_data(self, page_id: str) -> list[str]:
result_lines_arr = []
- cur_block_id = page_id
+ start_cursor = None
+ block_url = BLOCK_CHILD_URL_TMPL.format(block_id=page_id)
  while True:
- block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
- query_dict: dict[str, Any] = {}
-
+ query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}
res = requests.request(
"GET",
block_url,
Expand All @@ -153,7 +152,7 @@ def _get_notion_block_data(self, page_id: str) -> list[str]:
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
- json=query_dict
+ params=query_dict
)
data = res.json()
for result in data["results"]:
Expand Down Expand Up @@ -191,16 +190,16 @@ def _get_notion_block_data(self, page_id: str) -> list[str]:
if data["next_cursor"] is None:
break
else:
- cur_block_id = data["next_cursor"]
+ start_cursor = data["next_cursor"]
return result_lines_arr

def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
"""Read a block."""
result_lines_arr = []
- cur_block_id = block_id
+ start_cursor = None
+ block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
  while True:
- block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
- query_dict: dict[str, Any] = {}
+ query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}

res = requests.request(
"GET",
Expand All @@ -210,7 +209,7 @@ def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
- json=query_dict
+ params=query_dict
)
data = res.json()
if 'results' not in data or data["results"] is None:
Expand Down Expand Up @@ -249,7 +248,7 @@ def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
if data["next_cursor"] is None:
break
else:
- cur_block_id = data["next_cursor"]
+ start_cursor = data["next_cursor"]

result_lines = "\n".join(result_lines_arr)
return result_lines
Expand All @@ -258,10 +257,10 @@ def _read_table_rows(self, block_id: str) -> str:
"""Read table rows."""
done = False
result_lines_arr = []
- cur_block_id = block_id
+ start_cursor = None
+ block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
  while not done:
- block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
- query_dict: dict[str, Any] = {}
+ query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}

res = requests.request(
"GET",
Expand All @@ -271,7 +270,7 @@ def _read_table_rows(self, block_id: str) -> str:
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
},
- json=query_dict
+ params=query_dict
)
data = res.json()
# get table headers text
Expand Down Expand Up @@ -300,7 +299,7 @@ def _read_table_rows(self, block_id: str) -> str:
done = True
break
else:
- cur_block_id = data["next_cursor"]
+ start_cursor = data["next_cursor"]

result_lines = "\n".join(result_lines_arr)
return result_lines
Expand Down

0 comments on commit f546db5

Please sign in to comment.