Skip to content

can I scrape a local html file? #929

Open
@nyck33

Description

@nyck33
import json
import os
from scrapegraphai.graphs import SmartScraperGraph

openai_api_key = os.getenv("OPENAI_API_KEY")

# Configuration for the scraping pipeline
graph_config = {
    "llm": {
        "api_key": openai_api_key,
        "model": "openai/gpt-4-0125-preview",
    },
    "verbose": True,
    "headless": True,
}

# Path to your local HTML file
html_file_path = "slack_html/0221-onboarding-bitbucket.html"

# Read and properly escape HTML content
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = json.dumps(file.read(), ensure_ascii=False)

# Create input dictionary with EXACT variable names from error message
inputs = {
    '"content"': html_content,  # Double-quoted key with JSON-escaped content
    'question': "Extract all messages with their timestamps and usernames from the Slack conversation."
}

# Initialize the SmartScraperGraph
smart_scraper = SmartScraperGraph(
    prompt=inputs,
    source=html_content,
    config=graph_config  # Explicitly pass variables dictionary
)

# Create output directory if it doesn't exist
output_folder_path = "cleaned_slack_html"
os.makedirs(output_folder_path, exist_ok=True)

# Execute the scraping pipeline
try:
    result = smart_scraper.run()
    
    # Prepare output file path
    output_file_name = "cleaned_" + os.path.basename(html_file_path).replace(".html", ".json")
    output_file_path = os.path.join(output_folder_path, output_file_name)
    
    # Save results
    print("Extraction completed. Saving results...")
    with open(output_file_path, "w", encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    
    print(f"Results saved to: {output_file_path}")

except Exception as e:
    print(f"An error occurred: {str(e)}")

keeps throwing something about content

--- Executing Fetch Node ---
--- (Fetching HTML from: "<div\n  role=\"presentation\"\n  class=\"c-message_kit__background p-message_pane_message__message c-message_kit__message p-message_pane_message__message--last\"\n  data-qa=\"message_container\"\n  data-qa-unprocessed=\"false\"\n  data-qa-placeholder=\"false\"\n>\n  <div\n    role=\"document\"\n    aria-roledescription=\"message\"\n    class=\"c-message_kit__hover\"\n    data-qa-hover=\"true\"\n  >\n    <div\n      class=\"c-message_kit__actions c-message_kit__actions--default\"\n      style=\"position: relative\"\n    >\n      <div class=\"c-message_kit__gutter\">\n        <div\n          role=\"presentation\"\n          class=\"c-message_kit__gutter__left\"\n          data-stringify-ignore=\"true\"\n        >\n          <span class=\"p-member_profile_hover_card\" role=\"presentation\"\n            ><button\n              class=\"c-button-unstyled c-message_kit__avatar c-avatar c-avatar--interactive\"\n              aria-hidden=\"true\"\n              aria-label=\"View Nobu’s Profile\"\n              tabindex=\"-1\"\n              type=\"button\"\n              style=\"height: 36px; width: 36px\"\n            >\n              <span\n                class=\"c-base_icon__width_only_container\"\n                style=\"height: 36px; width: 36px\"\n                ><img\n                  src=\"https://ca.slack-edge.com/TTMCKNRGW-U0807D9TGNM-91378e27e35d-48\"\n                  srcset=\"\n                    https://ca.slack-edge.com/TTMCKNRGW-U0807D9TGNM-91378e27e35d-72 2x\n                  \"\n                  class=\"c-base_icon c-base_icon--image\"\n                  aria-hidden=\"true\"\n                  role=\"img\"\n                  alt=\"\"\n                  style=\"width: 36px\"\n              /></span></button\n          ></span>\n        </div>\n        <div\n          role=\"presentation\"\n          class=\"c-message_kit__gutter__right\"\n          data-qa=\"message_content\"\n        >\n          <span\n            class=\"c-message__sender c-message_kit__sender\"\n            data-qa=\"message_sender\"\n            data-stringify-type=\"replace\"\n            data-stringify-text=\"Nobu\"\n            ><span class=\"p-member_profile_hover_card\" role=\"presentation\"\n              ><button\n                data-message-sender=\"U0807D9TGNM\"\n                data-qa=\"message_sender_name\"\n                class=\"c-link--button c-message__sender_button\"\n                type=\"button\"\n                tabindex=\"0\"\n              >\n                Nobu\n              </button></span\n            ><span\n              id=\"primary-C07UVPLJW4E-1740101333.214529-sender\"\n              class=\"offscreen\"\n              aria-hidden=\"true\"\n              data-qa=\"aria-labelledby-primary-C07UVPLJW4E-1740101333.214529-sender\"\n              >Nobu</span\n            ></span\n          >&nbsp;&nbsp;<a\n            aria-label=\"Today at 10:28:53 AM\"\n            data-stringify-type=\"replace\"\n            data-stringify-text=\"[10:28 AM]\"\n            data-stringify-requires-siblings=\"true\"\n            data-ts=\"1740101333.214529\"\n            delay=\"300\"\n            data-sk=\"tooltip_parent\"\n            class=\"c-link c-timestamp\"\n            href=\"https://c-2-c-group.slack.com/archives/C07UVPLJW4E/p1740101333214529\"\n            ><span class=\"c-timestamp__label\" data-qa=\"timestamp_label\"\n              >10:28 AM</span\n            ></a\n          ><br />\n          <div class=\"c-message_kit__blocks c-message_kit__blocks--rich_text\">\n            <div\n              class=\"c-message__message_blocks c-message__message_blocks--rich_text\"\n              data-qa=\"message-text\"\n            >\n              <div class=\"p-block_kit_renderer\" data-qa=\"block-kit-renderer\">\n                <div\n                  class=\"p-block_kit_renderer__block_wrapper p-block_kit_renderer__block_wrapper--first\"\n                >\n                  <div class=\"p-rich_text_block\" dir=\"auto\">\n                    <div class=\"p-rich_text_section\">\n                      件名:\n                      Bitbucketアクセス制限についての正式な説明を求めます<span\n                        aria-label=\"\"\n                        class=\"c-mrkdwn__br\"\n                        data-stringify-type=\"paragraph-break\"\n                      ></span\n                      >CFO羽嶋様<br />お世話になっております。Nobuです。<span\n                        aria-label=\"\"\n                        class=\"c-mrkdwn__br\"\n                        data-stringify-type=\"paragraph-break\"\n                      ></span\n                   ****filler********                data-qa=\"file_image_thumbnail_img\"\n                          /></div\n                      ></a>\n                    </div>\n                  </div>\n                </div>\n              </div>\n            </div>\n            <div class=\"resize-triggers\">\n              <div class=\"expand-trigger\">\n                <div style=\"width: 494px; height: 229px\"></div>\n              </div>\n              <div class=\"contract-trigger\"></div>\n            </div>\n          </div>\n        </div>\n      </div>\n    </div>\n  </div>\n</div>\n") ---
--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---
Error during chain execution: 'Input to PromptTemplate is missing variables {\'"content"\'}.  Expected: [\'"content"\', \'question\'] Received: [\'question\']\nNote: if you intended {"content"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"content"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '
An error occurred: 'Input to PromptTemplate is missing variables {\'"content"\'}.  Expected: [\'"content"\', \'question\'] Received: [\'question\']\nNote: if you intended {"content"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{"content"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't workingstaleIssue has not had recent activity or appears to be solved. Stale issues will be automatically closed

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions