Open
Description
import json
import os
from scrapegraphai.graphs import SmartScraperGraph
openai_api_key = os.getenv("OPENAI_API_KEY")
# Configuration for the scraping pipeline
graph_config = {
"llm": {
"api_key": openai_api_key,
"model": "openai/gpt-4-0125-preview",
},
"verbose": True,
"headless": True,
}
# Path to your local HTML file
html_file_path = "slack_html/0221-onboarding-bitbucket.html"
# Read and properly escape HTML content
with open(html_file_path, 'r', encoding='utf-8') as file:
html_content = json.dumps(file.read(), ensure_ascii=False)
# Create input dictionary with EXACT variable names from error message
inputs = {
'"content"': html_content, # Double-quoted key with JSON-escaped content
'question': "Extract all messages with their timestamps and usernames from the Slack conversation."
}
# Initialize the SmartScraperGraph
smart_scraper = SmartScraperGraph(
prompt=inputs,
source=html_content,
config=graph_config # Explicitly pass variables dictionary
)
# Create output directory if it doesn't exist
output_folder_path = "cleaned_slack_html"
os.makedirs(output_folder_path, exist_ok=True)
# Execute the scraping pipeline
try:
result = smart_scraper.run()
# Prepare output file path
output_file_name = "cleaned_" + os.path.basename(html_file_path).replace(".html", ".json")
output_file_path = os.path.join(output_folder_path, output_file_name)
# Save results
print("Extraction completed. Saving results...")
with open(output_file_path, "w", encoding='utf-8') as f:
json.dump(result, f, indent=4, ensure_ascii=False)
print(f"Results saved to: {output_file_path}")
except Exception as e:
print(f"An error occurred: {str(e)}")
keeps throwing something about content
--- Executing Fetch Node ---
--- (Fetching HTML from: "<div\n role=\"presentation\"\n class=\"c-message_kit__background p-message_pane_message__message c-message_kit__message p-message_pane_message__message--last\"\n data-qa=\"message_container\"\n data-qa-unprocessed=\"false\"\n data-qa-placeholder=\"false\"\n>\n <div\n role=\"document\"\n aria-roledescription=\"message\"\n class=\"c-message_kit__hover\"\n data-qa-hover=\"true\"\n >\n <div\n class=\"c-message_kit__actions c-message_kit__actions--default\"\n style=\"position: relative\"\n >\n <div class=\"c-message_kit__gutter\">\n <div\n role=\"presentation\"\n class=\"c-message_kit__gutter__left\"\n data-stringify-ignore=\"true\"\n >\n <span class=\"p-member_profile_hover_card\" role=\"presentation\"\n ><button\n class=\"c-button-unstyled c-message_kit__avatar c-avatar c-avatar--interactive\"\n aria-hidden=\"true\"\n aria-label=\"View Nobu’s Profile\"\n tabindex=\"-1\"\n type=\"button\"\n style=\"height: 36px; width: 36px\"\n >\n <span\n class=\"c-base_icon__width_only_container\"\n style=\"height: 36px; width: 36px\"\n ><img\n src=\"https://ca.slack-edge.com/TTMCKNRGW-U0807D9TGNM-91378e27e35d-48\"\n srcset=\"\n https://ca.slack-edge.com/TTMCKNRGW-U0807D9TGNM-91378e27e35d-72 2x\n \"\n class=\"c-base_icon c-base_icon--image\"\n aria-hidden=\"true\"\n role=\"img\"\n alt=\"\"\n style=\"width: 36px\"\n /></span></button\n ></span>\n </div>\n <div\n role=\"presentation\"\n class=\"c-message_kit__gutter__right\"\n data-qa=\"message_content\"\n >\n <span\n class=\"c-message__sender c-message_kit__sender\"\n data-qa=\"message_sender\"\n data-stringify-type=\"replace\"\n data-stringify-text=\"Nobu\"\n ><span class=\"p-member_profile_hover_card\" role=\"presentation\"\n ><button\n data-message-sender=\"U0807D9TGNM\"\n data-qa=\"message_sender_name\"\n class=\"c-link--button c-message__sender_button\"\n type=\"button\"\n tabindex=\"0\"\n >\n Nobu\n </button></span\n ><span\n id=\"primary-C07UVPLJW4E-1740101333.214529-sender\"\n class=\"offscreen\"\n aria-hidden=\"true\"\n data-qa=\"aria-labelledby-primary-C07UVPLJW4E-1740101333.214529-sender\"\n >Nobu</span\n ></span\n > <a\n aria-label=\"Today at 10:28:53 AM\"\n data-stringify-type=\"replace\"\n data-stringify-text=\"[10:28 AM]\"\n data-stringify-requires-siblings=\"true\"\n data-ts=\"1740101333.214529\"\n delay=\"300\"\n data-sk=\"tooltip_parent\"\n class=\"c-link c-timestamp\"\n href=\"https://c-2-c-group.slack.com/archives/C07UVPLJW4E/p1740101333214529\"\n ><span class=\"c-timestamp__label\" data-qa=\"timestamp_label\"\n >10:28 AM</span\n ></a\n ><br />\n <div class=\"c-message_kit__blocks c-message_kit__blocks--rich_text\">\n <div\n class=\"c-message__message_blocks c-message__message_blocks--rich_text\"\n data-qa=\"message-text\"\n >\n <div class=\"p-block_kit_renderer\" data-qa=\"block-kit-renderer\">\n <div\n class=\"p-block_kit_renderer__block_wrapper p-block_kit_renderer__block_wrapper--first\"\n >\n <div class=\"p-rich_text_block\" dir=\"auto\">\n <div class=\"p-rich_text_section\">\n 件名:\n Bitbucketアクセス制限についての正式な説明を求めます<span\n aria-label=\"\"\n class=\"c-mrkdwn__br\"\n data-stringify-type=\"paragraph-break\"\n ></span\n >CFO羽嶋様<br />お世話になっております。Nobuです。<span\n aria-label=\"\"\n class=\"c-mrkdwn__br\"\n data-stringify-type=\"paragraph-break\"\n ></span\n ****filler******** data-qa=\"file_image_thumbnail_img\"\n /></div\n ></a>\n </div>\n </div>\n </div>\n </div>\n </div>\n <div class=\"resize-triggers\">\n <div class=\"expand-trigger\">\n <div style=\"width: 494px; height: 229px\"></div>\n </div>\n <div class=\"contract-trigger\"></div>\n </div>\n </div>\n </div>\n </div>\n </div>\n </div>\n</div>\n") ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---
Error during chain execution: 'Input to PromptTemplate is missing variables {\'"content"\'}. Expected: [\'"content"\', \'question\'] Received: [\'question\']\nNote: if you intended {"content"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"content"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '
An error occurred: 'Input to PromptTemplate is missing variables {\'"content"\'}. Expected: [\'"content"\', \'question\'] Received: [\'question\']\nNote: if you intended {"content"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"content"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '