Beauty Pageant Data Scraper

Collect and analyze Vietnamese beauty pageant data

Scraper Configuration
Hold Ctrl (Windows/Linux) or Cmd (macOS) while clicking to select multiple pageants
Scraper Code Overview

The TALENTBook Beauty Pageant Scraper is built with Python and includes the following components:

Core Scraper Class
class VietnamBeautyPageantScraper:
    """Skeleton of the scraper for Vietnamese beauty pageant data.

    This listing is an overview: the method implementations are elided and
    each stub below carries only a docstring so that the class parses.

    Attributes:
        base_url: Wikipedia URL prefix, "https://en.wikipedia.org/wiki/".
        pageant_types: Wikipedia-style page names of the pageants covered
            (presumably appended to ``base_url`` -- confirm against the
            full implementation).
        current_year: Year taken from ``datetime.now()`` at construction.
        start_year: ``current_year - 10``.

    NOTE(review): ``process_pageant_data`` also calls ``generate_calendar``
    and ``create_winner_history``, which are not part of this overview --
    confirm they exist in the full implementation.
    """

    def __init__(self):
        """Set the base URL, the pageant page names and the year window."""
        self.base_url = "https://en.wikipedia.org/wiki/"
        self.pageant_types = [
            "Miss_Universe_Vietnam",
            "Miss_Vietnam",
            "Miss_World_Vietnam",
            "Miss_Grand_Vietnam",
            "Miss_Earth_Vietnam",
            "Miss_International_Vietnam"
        ]
        # Relies on `from datetime import datetime` being in scope.
        self.current_year = datetime.now().year
        self.start_year = self.current_year - 10
        # ... additional initialization code ...

    def scrape_pageant_data(self):
        """Main method to scrape all pageant data."""
        # ... implementation ...

    def extract_contestant_information(self, soup, pageant_type, year):
        """Extract detailed contestant information.

        Args:
            soup: Parsed page to read from (presumably a BeautifulSoup
                object -- confirm).
            pageant_type: Pageant identifier (presumably one of
                ``self.pageant_types`` -- confirm).
            year: Edition year of the pageant.
        """
        # ... implementation ...

    def collect_social_media_profiles(self, contestant_name):
        """Find and collect social media profiles for a contestant.

        Args:
            contestant_name: Name of the contestant to look up.
        """
        # ... implementation ...

    def generate_rankings(self):
        """Generate various rankings based on collected data."""
        # ... implementation ...

    def generate_top_performer_personas(self):
        """Create detailed personas for top performers."""
        # ... implementation ...
Data Processing Pipeline
def process_pageant_data():
    """Run the full scrape-and-export pipeline and return its results.

    A fresh ``VietnamBeautyPageantScraper`` is created, then in order:
    pageant data is scraped, rankings, calendar, winner history and
    top-performer personas are generated, and everything is handed to
    ``export_data_to_csv``.

    Returns:
        dict with the keys ``'pageant_data'``, ``'rankings'``,
        ``'calendar'``, ``'winners'`` and ``'personas'`` holding the
        corresponding scraper results.
    """
    scraper = VietnamBeautyPageantScraper()

    # A dict display evaluates its values top to bottom, so the scraper
    # methods run in the same order as the numbered pipeline steps.
    results = {
        'pageant_data': scraper.scrape_pageant_data(),
        'rankings': scraper.generate_rankings(),
        'calendar': scraper.generate_calendar(),
        'winners': scraper.create_winner_history(),
        'personas': scraper.generate_top_performer_personas(),
    }

    # Export everything to structured formats before returning.
    export_data_to_csv(
        results['pageant_data'],
        results['rankings'],
        results['calendar'],
        results['winners'],
        results['personas'],
    )

    return results
Integration with TALENTBook Platform
def integrate_with_talentbook(scraped_data):
    """Import scraped pageant data into the TALENTBook database.

    Args:
        scraped_data: Mapping with at least the keys ``'pageant_data'``,
            ``'rankings'`` and ``'personas'``. Each item of
            ``'pageant_data'`` is indexed with ``'contestants'`` and
            ``'social_profiles'`` when the summary is built.

    Returns:
        dict with ``'status'`` set to ``'success'`` and ``imported_*``
        counts computed from the sizes of the input collections.
    """
    db = connect_to_database()

    # Events, contestants and social profiles are all imported from the
    # same 'pageant_data' collection.
    events = scraped_data['pageant_data']
    import_pageant_events(db, events)
    import_contestants(db, events)
    import_social_profiles(db, events)

    rankings = scraped_data['rankings']
    import_rankings(db, rankings)

    personas = scraped_data['personas']
    import_personas(db, personas)

    generate_visualization_data(db)

    def _count_nested(key):
        # Total number of entries stored under `key` across all events.
        return sum(len(event[key]) for event in events)

    return {
        'status': 'success',
        'imported_events': len(events),
        'imported_contestants': _count_nested('contestants'),
        'imported_profiles': _count_nested('social_profiles'),
        'imported_rankings': len(rankings),
        'imported_personas': len(personas),
    }