I wanted to download all the photos from a Pingboard directory for a project. Pingboard is an employee directory service that displays staff photos and information, but there’s no built-in way to bulk export the photos.
This script uses Playwright to connect to an existing browser session with Chrome Remote Debugging (avoiding login complications), scrapes the employee directory page to extract photo URLs and employee information, then downloads all the photos with descriptive filenames.
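Stripped down to just the connection step, the pattern looks roughly like this (a minimal sketch, assuming Chrome was started with --remote-debugging-port=9222 and that the tab you want is the first one in the default profile):

import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        # Attach to the Chrome you already have open instead of launching a new one,
        # so the script sees the same cookies and logins you do
        browser = await p.chromium.connect_over_cdp("http://localhost:9222")
        context = browser.contexts[0]   # the default browser context
        page = context.pages[0]         # reuse the first open tab
        print(page.url)

asyncio.run(main())

Because it attaches to a profile you are already logged into, there is no need to script the login flow at all.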
The trickiest part was handling Pingboard’s dynamic loading: the directory uses JavaScript to render the employee grid, so I had to wait for the content to load and then run JavaScript in the page to extract the data. The script also handles pagination by automatically clicking “More” buttons to load additional employees.
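The wait-and-click loop for that looks roughly like this (a minimal sketch: the “More” button name and the networkidle waits are assumptions about Pingboard’s markup rather than selectors taken from the page; the ul.grid selector comes from the extraction step in the script):

from playwright.async_api import Page

async def load_full_directory(page: Page, directory_url: str) -> None:
    """Keep clicking the (assumed) 'More' button until the whole directory has rendered."""
    await page.goto(directory_url, wait_until="networkidle")
    await page.wait_for_selector("ul.grid li")            # wait for the employee grid to appear
    while True:
        more_button = page.get_by_role("button", name="More")
        if await more_button.count() == 0:
            break                                         # nothing left to load
        await more_button.first.click()
        await page.wait_for_load_state("networkidle")     # let the next batch render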
I used Python with async/await throughout, though the downloads themselves run one at a time with a small delay between requests. The script saves a JSON log of all the download attempts, making it easy to retry failed downloads or process the data later.
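If you wanted the downloads to actually run in parallel, the download_image() coroutine defined in the script could be fanned out with asyncio.gather() behind a semaphore. A rough sketch of that variant (download_all() and the limit of 5 are hypothetical, not part of the script):

import asyncio
import os
import aiohttp

async def download_all(users, output_dir, max_concurrency=5):
    """Hypothetical variant: download every photo with at most max_concurrency in flight."""
    semaphore = asyncio.Semaphore(max_concurrency)

    async def bounded_download(session, user):
        async with semaphore:  # cap the number of simultaneous requests
            base_filepath = os.path.join(output_dir, user['name'])
            return await download_image(session, user['imageUrl'], base_filepath)

    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(bounded_download(session, user) for user in users)
        )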
To use it, start Chrome with remote debugging enabled (chrome --remote-debugging-port=9222), log into Pingboard in that browser, then run the script.
This was written with Claude (Opus 4 and a bit of Sonnet cleanup).
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.8"
# dependencies = [
# "playwright>=1.40.0",
# "aiohttp>=3.8.0",
# "aiofiles>=23.0.0",
# "requests>=2.28.0",
# ]
# ///
import asyncio
import os
import json
import sys
from playwright.async_api import async_playwright
import aiohttp
import aiofiles
from datetime import datetime

def detect_image_format(content_type, url, content):
    """Detect image format from content-type, URL, or content"""
    if content_type:
        if 'svg' in content_type.lower():
            return 'svg'
        elif 'png' in content_type.lower():
            return 'png'
        elif 'gif' in content_type.lower():
            return 'gif'
        elif 'webp' in content_type.lower():
            return 'webp'
        elif 'jpeg' in content_type.lower() or 'jpg' in content_type.lower():
            return 'jpg'
    # Check URL extension
    url_lower = url.lower()
    if url_lower.endswith('.svg'):
        return 'svg'
    elif url_lower.endswith('.png'):
        return 'png'
    elif url_lower.endswith('.gif'):
        return 'gif'
    elif url_lower.endswith('.webp'):
        return 'webp'
    elif url_lower.endswith('.jpg') or url_lower.endswith('.jpeg'):
        return 'jpg'
    # Check content headers (magic bytes)
    if content:
        if content.startswith(b'<svg') or content.startswith(b'<?xml'):
            return 'svg'
        elif content.startswith(b'\x89PNG'):
            return 'png'
        elif content.startswith(b'GIF8'):
            return 'gif'
        elif content.startswith(b'RIFF') and b'WEBP' in content[:12]:
            return 'webp'
        elif content.startswith(b'\xff\xd8\xff'):
            return 'jpg'
    # Default to jpg if we can't detect
    return 'jpg'

async def download_image(session, url, base_filepath):
    """Download an image from URL to filepath with auto-detected extension"""
    try:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"HTTP {response.status} error for {url}")
                return False, None, None
            content = await response.read()
            content_type = response.headers.get('content-type', '')
            # Detect the actual image format
            image_format = detect_image_format(content_type, url, content)
            # Update filepath with correct extension
            base_name = os.path.splitext(base_filepath)[0]
            filepath = f"{base_name}.{image_format}"
            async with aiofiles.open(filepath, 'wb') as f:
                await f.write(content)
            return True, image_format, os.path.basename(filepath)
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return False, None, None

def build_directory_url(domain):
    """Build the directory URL from a domain, handling various input formats"""
    # Remove protocol if present
    if domain.startswith('http://') or domain.startswith('https://'):
        domain = domain.split('://', 1)[1]
    # Remove trailing slash and /users/directory if present
    domain = domain.rstrip('/')
    if domain.endswith('/users/directory'):
        domain = domain[:-len('/users/directory')]
    elif domain.endswith('/users'):
        domain = domain[:-len('/users')]
    # Build full URL
    return f"https://{domain}/users/directory"

async def scrape_pingboard_photos(pingboard_domain):
    # Build the directory URL
    directory_url = build_directory_url(pingboard_domain)
    print(f"Target URL: {directory_url}")
    # Create output directory with timestamp and domain
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    domain_name = pingboard_domain.split('.')[0].replace('https://', '').replace('http://', '')
    output_dir = f"pingboard_photos_{domain_name}_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
    # Create a log file for the scraping session
    log_file = os.path.join(output_dir, "download_log.json")
    async with async_playwright() as p:
        # Connect to existing Chrome instance with remote debugging
        # First, start Chrome with: chrome --remote-debugging-port=9222
        print("Connecting to Chrome DevTools on port 9222...")
        print("Make sure Chrome is running with: chrome --remote-debugging-port=9222")
        try:
            browser = await p.chromium.connect_over_cdp("http://localhost:9222")
        except Exception as e:
            print(f"Failed to connect to Chrome: {e}")
            print("\nTo start Chrome with debugging:")
            print("Windows: chrome.exe --remote-debugging-port=9222")
            print("Mac: /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222")
            print("Linux: google-chrome --remote-debugging-port=9222")
            return
        contexts = browser.contexts
        if not contexts:
            print("No browser contexts found.")
            return
        context = contexts[0]
        pages = context.pages
        page = pages[0] if pages else await context.new_page()
        # Navigate to directory
        current_url = page.url
        if directory_url not in current_url:
            print(f"Navigating to {directory_url}...")
            await page.goto(directory_url, wait_until="networkidle")
            await asyncio.sleep(3)
print("Extracting user data from directory...")
# Execute JavaScript to extract all user data at once
users_data = await page.evaluate("""
() => {
// Find the main grid container
const userGrid = document.querySelector('ul.grid') ||
document.querySelector('ul[class*="grid"]');
if (!userGrid) {
return { error: "Could not find user grid" };
}
// Get all user list items
const userListItems = userGrid.querySelectorAll('li');
// Extract data from each user
const users = Array.from(userListItems).map((li, index) => {
const link = li.querySelector('a[href*="/users/"]');
const img = li.querySelector('img');
if (!link || !img) return null;
// Extract user ID from URL
const userId = link.href.split('/users/')[1];
// Get name from alt text (cleaner than parsing fullText)
const name = img.alt || '';
// Get full text for job title extraction
const fullText = li.textContent.trim();
// Try to extract job title by removing the name from fullText
let jobTitle = fullText;
if (name && fullText.startsWith(name)) {
jobTitle = fullText.substring(name.length).trim();
}
return {
index,
userId,
profileUrl: link.href,
imageUrl: img.src,
name: name,
jobTitle: jobTitle,
fullText: fullText
};
}).filter(user => user !== null);
return {
totalFound: userListItems.length,
validUsers: users.length,
users: users
};
}
""")
        if 'error' in users_data:
            print(f"Error: {users_data['error']}")
            await browser.close()
            return
        print(f"Found {users_data['validUsers']} valid users out of {users_data['totalFound']} total entries")
        # Prepare download summary
        download_summary = {
            "timestamp": timestamp,
            "pingboard_domain": pingboard_domain,
            "directory_url": directory_url,
            "total_users": users_data['validUsers'],
            "downloads": []
        }
        # Download images
        successful = 0
        failed = 0
        async with aiohttp.ClientSession() as session:
            for i, user in enumerate(users_data['users'], 1):
                name = user['name']
                if not name:
                    print(f" ⚠ Skipping user {user['userId']} - no name found")
                    failed += 1
                    continue
                # Clean filename (without extension - will be added based on detected format)
                base_filename = name
                # Remove any invalid filename characters
                base_filename = "".join(c for c in base_filename if c.isalnum() or c in (' ', '-', '_')).strip()
                base_filepath = os.path.join(output_dir, base_filename)
                print(f"[{i}/{users_data['validUsers']}] Downloading: {name}")
                success, image_format, final_filename = await download_image(session, user['imageUrl'], base_filepath)
                download_info = {
                    "name": name,
                    "filename": final_filename,
                    "image_format": image_format,
                    "userId": user['userId'],
                    "profileUrl": user['profileUrl'],
                    "imageUrl": user['imageUrl'],
                    "jobTitle": user['jobTitle'],
                    "success": success
                }
                download_summary['downloads'].append(download_info)
                if success:
                    format_indicator = f" ({image_format.upper()})" if image_format != 'jpg' else ""
                    print(f" ✓ Saved as {final_filename}{format_indicator}")
                    successful += 1
                else:
                    print(f" ✗ Failed to download")
                    failed += 1
                # Small delay between downloads
                await asyncio.sleep(0.5)
        # Save download summary
        with open(log_file, 'w', encoding='utf-8') as f:
            json.dump(download_summary, f, indent=2, ensure_ascii=False)
        await browser.close()
        print(f"\n{'='*50}")
        print(f"Download complete!")
        print(f"✓ Successful: {successful}")
        print(f"✗ Failed: {failed}")
        print(f"📁 Photos saved in: {output_dir}")
        print(f"📄 Log file: {log_file}")
        print(f"{'='*50}")

# Additional utility function to process existing directory data
def process_directory_json(json_file, output_dir="processed_photos"):
    """
    Process a JSON file containing user data and download photos.
    Useful if you've already extracted the data and want to download later.
    """
    import requests  # only needed by this synchronous helper
    os.makedirs(output_dir, exist_ok=True)
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    users = data.get('users', [])
    print(f"Processing {len(users)} users from JSON...")
    successful = 0
    failed = 0
    for i, user in enumerate(users, 1):
        name = user.get('altText', '') or user.get('name', '')
        if not name:
            print(f"Skipping user {i} - no name found")
            failed += 1
            continue
        base_filename = name
        base_filename = "".join(c for c in base_filename if c.isalnum() or c in (' ', '-', '_')).strip()
        print(f"[{i}/{len(users)}] Downloading: {name}")
        try:
            response = requests.get(user['imageUrl'], timeout=30)
            if response.status_code == 200:
                content = response.content
                content_type = response.headers.get('content-type', '')
                # Detect image format
                image_format = detect_image_format(content_type, user['imageUrl'], content)
                filename = f"{base_filename}.{image_format}"
                filepath = os.path.join(output_dir, filename)
                with open(filepath, 'wb') as f:
                    f.write(content)
                format_indicator = f" ({image_format.upper()})" if image_format != 'jpg' else ""
                print(f" ✓ Saved as {filename}{format_indicator}")
                successful += 1
            else:
                print(f" ✗ Failed with status {response.status_code}")
                failed += 1
        except Exception as e:
            print(f" ✗ Error: {e}")
            failed += 1
    print(f"\nComplete! Successful: {successful}, Failed: {failed}")

def print_usage():
    """Print usage information"""
    print("Usage: python scrape_pingboard.py <pingboard_domain>")
    print("\nExamples:")
    print(" python scrape_pingboard.py yourcompany.pingboard.com")
    print(" python scrape_pingboard.py https://yourcompany.pingboard.com")
    print(" python scrape_pingboard.py yourcompany.pingboard.com/users/directory")
    print("\nThe script will automatically add /users/directory to the URL if needed.")

# Run the scraper
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print_usage()
        sys.exit(1)
    pingboard_domain = sys.argv[1]
    # Validate that it looks like a pingboard domain
    if 'pingboard.com' not in pingboard_domain:
        print(f"Warning: '{pingboard_domain}' doesn't appear to be a Pingboard domain.")
        response = input("Continue anyway? (y/N): ")
        if response.lower() not in ['y', 'yes']:
            print("Aborted.")
            sys.exit(1)
    print(f"Starting scraper for: {pingboard_domain}")
    asyncio.run(scrape_pingboard_photos(pingboard_domain))
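
# Example of reusing the standalone helper later, assuming you saved the "users"
# array from the page.evaluate() step to its own JSON file (e.g. users.json):
#
#     from scrape_pingboard import process_directory_json
#     process_directory_json("users.json", output_dir="processed_photos")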