- Add new cvs code to cvs

git-svn-id: svn://svn.coreboot.org/coreboot/trunk@1657 2b7e53f0-3cfb-0310-b3e9-8179ed1497e1
Eric Biederman 2004-10-14 19:29:29 +00:00
parent 98e619b1ce
commit fcd5ace00b
79 changed files with 8384 additions and 0 deletions

View File

@@ -0,0 +1,124 @@
/*
This software and ancillary information (herein called "SOFTWARE")
called LinuxBIOS is made available under the terms described
here. The SOFTWARE has been approved for release with associated
LA-CC Number 00-34. Unless otherwise indicated, this SOFTWARE has
been authored by an employee or employees of the University of
California, operator of the Los Alamos National Laboratory under
Contract No. W-7405-ENG-36 with the U.S. Department of Energy. The
U.S. Government has rights to use, reproduce, and distribute this
SOFTWARE. The public may copy, distribute, prepare derivative works
and publicly display this SOFTWARE without charge, provided that this
Notice and any statement of authorship are reproduced on all copies.
Neither the Government nor the University makes any warranty, express
or implied, or assumes any liability or responsibility for the use of
this SOFTWARE. If SOFTWARE is modified to produce derivative works,
such modified SOFTWARE should be clearly marked, so as not to confuse
it with the version available from LANL.
*/
/* Copyright 2000, Ron Minnich, Advanced Computing Lab, LANL
* rminnich@lanl.gov
*/
/** Start code to put an i386 or later processor into 32-bit
* protected mode.
*/
/* .section ".rom.text" */
#include <arch/rom_segs.h>
.code16
.globl _start
.type _start, @function
_start:
cli
/* Save the BIST result */
movl %eax, %ebp
/* Thanks to kmliu@sis.tw.com for this TLB fix ... */
/*
* IMMEDIATELY invalidate the translation lookaside buffer before executing
* any further code. Even though paging is disabled we could still get
* false address translations due to the TLB if we didn't invalidate it.
*/
xorl %eax, %eax
movl %eax, %cr3 /* Invalidate TLB*/
/* Invalidating the cache here seems to be a bad idea on
* modern processors. Don't.
* If we are hyperthreaded or have multiple cores it is bad
* for SMP startup. On Opterons it causes a 5 second delay.
* Invalidating the cache was pure paranoia in any event.
* If your cpu needs it you can write a cpu dependent version of
* entry16.inc.
*/
/* Note: gas handles memory addresses in 16 bit code very poorly.
* In particular it doesn't appear to have a directive allowing you
* to associate a section or even an absolute offset with a segment register.
*
* This means that anything except cs:ip relative offsets is
* a real pain in 16 bit mode, and explains why it is almost
* impossible to get gas to do lgdt correctly.
*
* One way to work around this is to have the linker do the
* math instead of the assembler. This solves the very
* practical problem of being able to write code that can
* be relocated.
*
* An lgdt call before we have memory enabled cannot be
* position independent, as we cannot execute a call
* instruction to get our current instruction pointer.
* So while this code is relocatable it isn't arbitrarily
* relocatable.
*
* The criteria for relocation have been relaxed to their
* utmost, so that we can use the same code for both
* our initial entry point and startup of the second cpu.
* The code assumes when executing at _start that:
* (((cs & 0xfff) == 0) and (ip == _start & 0xffff))
* or
* ((cs == anything) and (ip == 0)).
*
* The restrictions in reset16.inc mean that _start initially
* must be loaded at or above 0xffff0000 or below 0x100000.
*
* The linker script computes gdtptr16_offset by simply returning
* the low 16 bits. This means that the initial segment used
* when _start is called must be 64K aligned. This should not
* restrict the address as the ip address can be anything.
*/
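/* For illustration (hypothetical but typical values): coming out of
* reset the %cs selector is 0xf000 while its hidden base is 0xffff0000,
* so the shift below leaves %ax = 0x0000 and %bx = gdtptr16 & 0xffff;
* lgdt %cs:(%bx) then addresses 0xffff0000 + %bx, which is gdtptr16
* as long as gdtptr16 lies in that top 64KB of the address space.
*/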
movw %cs, %ax
shlw $4, %ax
movw $gdtptr16_offset, %bx
subw %ax, %bx
data32 lgdt %cs:(%bx)
movl %cr0, %eax
andl $0x7FFAFFD1, %eax /* PG,AM,WP,NE,TS,EM,MP = 0 */
orl $0x60000001, %eax /* CD, NW, PE = 1 */
movl %eax, %cr0
/* Restore BIST to %eax */
movl %ebp, %eax
/* Now that we are in protected mode jump to a 32 bit code segment. */
data32 ljmp $ROM_CODE_SEG, $__protected_start
/** The gdt has a 4 GB code segment at 0x10, and a 4 GB data segment
* at 0x18; these are Linux-compatible.
*/
.align 4
.globl gdtptr16
gdtptr16:
.word gdt_end - gdt -1 /* compute the table limit */
.long gdt /* we know the offset */
.globl _estart
_estart:
.code32

View File

@@ -0,0 +1,2 @@
gdtptr16_offset = gdtptr16 & 0xffff;
_start_offset = _start & 0xffff;

View File

@@ -0,0 +1,21 @@
.section ".reset"
.code16
.globl reset_vector
reset_vector:
#if _ROMBASE >= 0xffff0000
/* jmp _start */
.byte 0xe9
.int _start - ( . + 2 )
/* Note: The above jump is hand coded to work around bugs in binutils.
* 5 bytes are used for a 3 byte instruction. This works because x86
* is little endian and allows us to use supported 32 bit relocations
* instead of the weird 16 bit relocations that binutils does not
* handle consistently between versions because they are used so rarely.
* The two extra displacement bytes are never executed because the jump
* is unconditional.
*/
#else
# error _ROMBASE is an unsupported value
#endif
. = 0x8;
.code32
jmp protected_start
.previous

View File

@@ -0,0 +1,14 @@
/*
* _ROMTOP : The top of the ROM, where we
* need to put the reset vector.
*/
SECTIONS {
_ROMTOP = (_ROMBASE >= 0xffff0000)? 0xfffffff0 : 0xffff0;
. = _ROMTOP;
.reset . : {
*(.reset)
. = 15 ;
BYTE(0x00);
}
}

View File

@@ -0,0 +1,61 @@
/* For starting linuxBIOS in protected mode */
#include <arch/rom_segs.h>
/* .section ".rom.text" */
.code32
.align 4
.globl gdtptr
gdt:
gdtptr:
.word gdt_end - gdt -1 /* compute the table limit */
.long gdt /* we know the offset */
.word 0
/* flat code segment */
.word 0xffff, 0x0000
.byte 0x00, 0x9b, 0xcf, 0x00
/* flat data segment */
.word 0xffff, 0x0000
.byte 0x00, 0x93, 0xcf, 0x00
gdt_end:
/*
* When we come here we are in protected mode. We expand
* the stack and copy the data segment from ROM to
* memory.
*
* After that, we call the chipset bootstrap routine that
* does what is left of the chipset initialization.
*
* NOTE aligned to 4 so that we are sure that the prefetch
* cache will be reloaded.
*/
.align 4
.globl protected_start
protected_start:
lgdt %cs:gdtptr
ljmp $ROM_CODE_SEG, $__protected_start
__protected_start:
/* Save the BIST value */
movl %eax, %ebp
intel_chip_post_macro(0x10) /* post 10 */
movw $ROM_DATA_SEG, %ax
movw %ax, %ds
movw %ax, %es
movw %ax, %ss
movw %ax, %fs
movw %ax, %gs
/* Restore the BIST value to %eax */
movl %ebp, %eax

View File

@@ -0,0 +1,14 @@
/*
_cache_ram_seg_base = DEFINED(CACHE_RAM_BASE)? CACHE_RAM_BASE - _rodata : 0;
_cache_ram_seg_base_low = (_cache_ram_seg_base) & 0xffff;
_cache_ram_seg_base_middle = (_cache_ram_seg_base >> 16) & 0xff;
_cache_ram_seg_base_high = (_cache_ram_seg_base >> 24) & 0xff;
_rom_code_seg_base = _ltext - _text;
_rom_code_seg_base_low = (_rom_code_seg_base) & 0xffff;
_rom_code_seg_base_middle = (_rom_code_seg_base >> 16) & 0xff;
_rom_code_seg_base_high = (_rom_code_seg_base >> 24) & 0xff;
*/

View File

@@ -0,0 +1,10 @@
.section ".reset"
.code16
.globl reset_vector
reset_vector:
. = 0x8;
.code32
jmp protected_start
.previous

View File

@@ -0,0 +1,14 @@
/*
* _ROMTOP : The top of the ROM, where we
* need to put the reset vector.
*/
SECTIONS {
_ROMTOP = _ROMBASE + ROM_IMAGE_SIZE - 0x10;
. = _ROMTOP;
.reset (.): {
*(.reset)
. = 15 ;
BYTE(0x00);
}
}

1
src/cpu/x86/cache/Config.lb vendored Normal file
View File

@@ -0,0 +1 @@
object cache.o

10
src/cpu/x86/cache/cache.c vendored Normal file
View File

@@ -0,0 +1,10 @@
#include <console/console.h>
#include <cpu/x86/cache.h>
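/* enable_cache() comes from <cpu/x86/cache.h>; conceptually (a sketch,
* not the exact header code) it clears the cache-disable bits in %cr0:
*
*   movl %cr0, %eax
*   andl $0x9fffffff, %eax   # clear CD (bit 30) and NW (bit 29)
*   movl %eax, %cr0
*/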
void x86_enable_cache(void)
{
post_code(0x60);
printk_info("Enabling cache\n");
enable_cache();
}

View File

@@ -0,0 +1,9 @@
/* preserve BIST in %eax */
movl %eax, %ebp
/* Disable floating point emulation */
movl %cr0, %eax
andl $~(1<<2), %eax
movl %eax, %cr0
movl %ebp, %eax

View File

@@ -0,0 +1,3 @@
object lapic.o
object lapic_cpu_init.o
object secondary.S

View File

@@ -0,0 +1,10 @@
#include <cpu/x86/msr.h>
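/* MSR 0x1b is IA32_APIC_BASE; bit 8 (BSP) is set only on the
* bootstrap processor, so boot_cpu() returns 1 on the BSP and
* 0 on the application processors.
*/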
int boot_cpu(void)
{
int bsp;
msr_t msr;
msr = rdmsr(0x1b);
bsp = !!(msr.lo & (1 << 8));
return bsp;
}

72
src/cpu/x86/lapic/lapic.c Normal file
View File

@@ -0,0 +1,72 @@
#include <cpu/x86/lapic.h>
#include <console/console.h>
#include <cpu/x86/msr.h>
#include <cpu/x86/mtrr.h>
void setup_lapic(void)
{
/* this is so interrupts work. This is very limited scope --
* linux will do better later, we hope ...
*/
/* this is the first way we learned to do it. It fails on real SMP
* stuff. So we have to do things differently ...
* see the Intel mp1.4 spec, page A-3
*/
#if NEED_LAPIC == 1
/* Only Pentium Pro and later have these MSRs */
msr_t msr;
printk_info("Setting up local apic...");
/* Enable the local apic */
msr = rdmsr(LAPIC_BASE_MSR);
msr.lo |= LAPIC_BASE_MSR_ENABLE;
msr.lo &= ~LAPIC_BASE_MSR_ADDR_MASK;
msr.lo |= LAPIC_DEFAULT_BASE;
wrmsr(LAPIC_BASE_MSR, msr);
/*
* Set Task Priority to 'accept all'.
*/
lapic_write_around(LAPIC_TASKPRI,
lapic_read_around(LAPIC_TASKPRI) & ~LAPIC_TPRI_MASK);
/* Put the local apic in virtual wire mode */
lapic_write_around(LAPIC_SPIV,
(lapic_read_around(LAPIC_SPIV) & ~(LAPIC_VECTOR_MASK))
| LAPIC_SPIV_ENABLE);
lapic_write_around(LAPIC_LVT0,
(lapic_read_around(LAPIC_LVT0) &
~(LAPIC_LVT_MASKED | LAPIC_LVT_LEVEL_TRIGGER |
LAPIC_LVT_REMOTE_IRR | LAPIC_INPUT_POLARITY |
LAPIC_SEND_PENDING |LAPIC_LVT_RESERVED_1 |
LAPIC_DELIVERY_MODE_MASK))
| (LAPIC_LVT_REMOTE_IRR |LAPIC_SEND_PENDING |
LAPIC_DELIVERY_MODE_EXTINT)
);
lapic_write_around(LAPIC_LVT1,
(lapic_read_around(LAPIC_LVT1) &
~(LAPIC_LVT_MASKED | LAPIC_LVT_LEVEL_TRIGGER |
LAPIC_LVT_REMOTE_IRR | LAPIC_INPUT_POLARITY |
LAPIC_SEND_PENDING |LAPIC_LVT_RESERVED_1 |
LAPIC_DELIVERY_MODE_MASK))
| (LAPIC_LVT_REMOTE_IRR |LAPIC_SEND_PENDING |
LAPIC_DELIVERY_MODE_NMI)
);
printk_debug(" apic_id: %d ", lapicid());
#else /* !NEED_LAPIC */
/* Only Pentium Pro and later have these MSRs */
msr_t msr;
printk_info("Disabling local apic...");
msr = rdmsr(LAPIC_BASE_MSR);
msr.lo &= ~LAPIC_BASE_MSR_ENABLE;
wrmsr(LAPIC_BASE_MSR, msr);
#endif /* !NEED_LAPIC */
printk_info("done.\n");
post_code(0x9b);
}

View File

@@ -0,0 +1,316 @@
#include <cpu/x86/lapic.h>
#include <delay.h>
#include <string.h>
#include <console/console.h>
#include <arch/hlt.h>
#include <device/device.h>
#include <device/path.h>
#include <smp/atomic.h>
#include <smp/spinlock.h>
#include <cpu/cpu.h>
#if CONFIG_SMP == 1
/* This is a lot more paranoid now, since Linux can NOT handle
* being told there is a CPU when none exists. So any errors
* will return 0, meaning no CPU.
*
* We actually handle that case by noting which cpus start up
* and not telling anyone about the ones that don't.
*/
static int lapic_start_cpu(unsigned long apicid)
{
int timeout;
unsigned long send_status, accept_status, start_eip;
int j, num_starts, maxlvt;
extern char _secondary_start[];
/*
* Starting actual IPI sequence...
*/
printk_spew("Asserting INIT.\n");
/*
* Turn INIT on target chip
*/
lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(apicid));
/*
* Send IPI
*/
lapic_write_around(LAPIC_ICR, LAPIC_INT_LEVELTRIG | LAPIC_INT_ASSERT
| LAPIC_DM_INIT);
printk_spew("Waiting for send to finish...\n");
timeout = 0;
do {
printk_spew("+");
udelay(100);
send_status = lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY;
} while (send_status && (timeout++ < 1000));
if (timeout >= 1000) {
printk_err("CPU %d: First apic write timed out. Disabling\n",
apicid);
// too bad.
printk_err("ESR is 0x%x\n", lapic_read(LAPIC_ESR));
if (lapic_read(LAPIC_ESR)) {
printk_err("Try to reset ESR\n");
lapic_write_around(LAPIC_ESR, 0);
printk_err("ESR is 0x%x\n", lapic_read(LAPIC_ESR));
}
return 0;
}
mdelay(10);
printk_spew("Deasserting INIT.\n");
/* Target chip */
lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(apicid));
/* Send IPI */
lapic_write_around(LAPIC_ICR, LAPIC_INT_LEVELTRIG | LAPIC_DM_INIT);
printk_spew("Waiting for send to finish...\n");
timeout = 0;
do {
printk_spew("+");
udelay(100);
send_status = lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY;
} while (send_status && (timeout++ < 1000));
if (timeout >= 1000) {
printk_err("CPU %d: Second apic write timed out. Disabling\n",
apicid);
// too bad.
return 0;
}
start_eip = (unsigned long)_secondary_start;
printk_spew("start_eip=0x%08lx\n", start_eip);
num_starts = 2;
/*
* Run STARTUP IPI loop.
*/
printk_spew("#startup loops: %d.\n", num_starts);
maxlvt = 4;
for (j = 1; j <= num_starts; j++) {
printk_spew("Sending STARTUP #%d to %u.\n", j, apicid);
lapic_read_around(LAPIC_SPIV);
lapic_write(LAPIC_ESR, 0);
lapic_read(LAPIC_ESR);
printk_spew("After apic_write.\n");
/*
* STARTUP IPI
*/
/* Target chip */
lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(apicid));
/* Boot on the stack */
/* Kick the second */
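/* The low byte of the STARTUP IPI is the vector: the 4KB page number
* of the real-mode entry point, which is why _secondary_start is
* aligned to 4096 in secondary.S.
*/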
lapic_write_around(LAPIC_ICR, LAPIC_DM_STARTUP
| (start_eip >> 12));
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(300);
printk_spew("Startup point 1.\n");
printk_spew("Waiting for send to finish...\n");
timeout = 0;
do {
printk_spew("+");
udelay(100);
send_status = lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY;
} while (send_status && (timeout++ < 1000));
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
/*
* Due to the Pentium erratum 3AP.
*/
if (maxlvt > 3) {
lapic_read_around(LAPIC_SPIV);
lapic_write(LAPIC_ESR, 0);
}
accept_status = (lapic_read(LAPIC_ESR) & 0xEF);
if (send_status || accept_status)
break;
}
printk_spew("After Startup.\n");
if (send_status)
printk_warning("APIC never delivered???\n");
if (accept_status)
printk_warning("APIC delivery error (%lx).\n", accept_status);
if (send_status || accept_status)
return 0;
return 1;
}
/* Number of cpus that are currently running in linuxbios */
static atomic_t active_cpus = ATOMIC_INIT(1);
/* start_cpu_lock covers last_cpu_index and secondary_stack.
* Only starting one cpu at a time lets me remove the logic
* for selecting the stack from assembly language.
*
* In addition, communicating by variables with the cpu I
* am starting allows me to verify it has started before
* start_cpu returns.
*/
static spinlock_t start_cpu_lock = SPIN_LOCK_UNLOCKED;
static unsigned last_cpu_index = 0;
volatile unsigned long secondary_stack;
int start_cpu(device_t cpu)
{
extern unsigned char _estack[];
struct cpu_info *info;
unsigned long stack_end;
unsigned long apicid;
unsigned long index;
unsigned long count;
int result;
spin_lock(&start_cpu_lock);
/* Get the cpu's apicid */
apicid = cpu->path.u.apic.apic_id;
/* Get an index for the new processor */
index = ++last_cpu_index;
/* Find end of the new processors stack */
stack_end = ((unsigned long)_estack) - (STACK_SIZE*index) - sizeof(struct cpu_info);
/* Record the index and which cpu structure we are using */
info = (struct cpu_info *)stack_end;
info->index = index;
info->cpu = cpu;
/* Advertise the new stack to start_cpu */
secondary_stack = stack_end;
/* Until the cpu starts up report the cpu is not enabled */
cpu->enabled = 0;
cpu->initialized = 0;
/* Start the cpu */
result = lapic_start_cpu(apicid);
if (result) {
result = 0;
/* Wait 1s or until the new cpu calls in */
for(count = 0; count < 100000 ; count++) {
if (secondary_stack == 0) {
result = 1;
break;
}
udelay(10);
}
}
secondary_stack = 0;
spin_unlock(&start_cpu_lock);
return result;
}
/* C entry point of secondary cpus */
void secondary_cpu_init(void)
{
atomic_inc(&active_cpus);
cpu_initialize();
atomic_dec(&active_cpus);
stop_this_cpu();
}
static void initialize_other_cpus(device_t root)
{
int old_active_count, active_count;
device_t cpu;
/* Loop through the cpus once getting them started */
for(cpu = root->link[1].children; cpu ; cpu = cpu->sibling) {
if (cpu->path.type != DEVICE_PATH_APIC) {
continue;
}
if (!cpu->enabled) {
continue;
}
if (cpu->initialized) {
continue;
}
if (!start_cpu(cpu)) {
/* Record the error in cpu? */
printk_err("CPU %u would not start!\n",
cpu->path.u.apic.apic_id);
}
}
/* Now loop until the other cpus have finished initializing */
old_active_count = 1;
active_count = atomic_read(&active_cpus);
while(active_count > 1) {
if (active_count != old_active_count) {
printk_info("Waiting for %d CPUS to stop\n", active_count);
old_active_count = active_count;
}
udelay(10);
active_count = atomic_read(&active_cpus);
}
for(cpu = root->link[1].children; cpu; cpu = cpu->sibling) {
if (cpu->path.type != DEVICE_PATH_APIC) {
continue;
}
if (!cpu->initialized) {
printk_err("CPU %u did not initialize!\n",
cpu->path.u.apic.apic_id);
#warning "FIXME do I need a mainboard_cpu_fixup function?"
}
}
printk_debug("All AP CPUs stopped\n");
}
#else /* CONFIG_SMP */
#define initialize_other_cpus(root) do {} while(0)
#endif /* CONFIG_SMP */
void initialize_cpus(device_t root)
{
struct device_path cpu_path;
struct cpu_info *info;
/* Find the info struct for this cpu */
info = cpu_info();
#if NEED_LAPIC == 1
/* Ensure the local apic is enabled */
enable_lapic();
/* Get the device path of the boot cpu */
cpu_path.type = DEVICE_PATH_APIC;
cpu_path.u.apic.apic_id = lapicid();
#else
/* Get the device path of the boot cpu */
cpu_path.type = DEVICE_PATH_BOOT_CPU;
#endif
/* Find the device structure for the boot cpu */
info->cpu = alloc_find_dev(&root->link[1], &cpu_path);
/* Initialize the bootstrap processor */
cpu_initialize();
/* Now initialize the rest of the cpus */
initialize_other_cpus(root);
}

View File

@@ -0,0 +1,53 @@
#include <arch/asm.h>
#include <arch/intel.h>
#include <cpu/x86/mtrr.h>
#include <cpu/x86/lapic_def.h>
.text
.globl _secondary_start
.balign 4096
_secondary_start:
.code16
cli
xorl %eax, %eax
movl %eax, %cr3 /* Invalidate TLB*/
/* On hyper threaded cpus, invalidating the cache here is
* very very bad. Don't.
*/
/* setup the data segment */
movw %cs, %ax
movw %ax, %ds
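/* The SIPI vector placed us at a 4KB-aligned CS base equal to
* _secondary_start, and %ds now shares that base, so the %ds-relative
* offset of gdtaddr is simply gdtaddr - _secondary_start.
*/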
data32 lgdt gdtaddr - _secondary_start
movl %cr0, %eax
andl $0x7FFAFFD1, %eax /* PG,AM,WP,NE,TS,EM,MP = 0 */
orl $0x60000001, %eax /* CD, NW, PE = 1 */
movl %eax, %cr0
ljmpl $0x10, $1f
1:
.code32
movw $0x18, %ax
movw %ax, %ds
movw %ax, %es
movw %ax, %ss
movw %ax, %fs
movw %ax, %gs
/* Set the stack pointer, and flag that we are done */
xorl %eax, %eax
movl secondary_stack, %esp
movl %eax, secondary_stack
call secondary_cpu_init
1: hlt
jmp 1b
gdtaddr:
.word gdt_limit /* the table limit */
.long gdt /* we know the offset */
.code32

View File

@@ -0,0 +1,2 @@
/* Clear out an mmx state */
emms

View File

@@ -0,0 +1,6 @@
/* BIST in %eax */
/*
* Enabling mmx registers is a noop
*/

View File

@@ -0,0 +1 @@
object mtrr.o

View File

@@ -0,0 +1,123 @@
#ifndef EARLYMTRR_C
#define EARLYMTRR_C
#include <cpu/x86/cache.h>
#include <cpu/x86/mtrr.h>
#include <cpu/x86/msr.h>
/* Validate XIP_ROM_SIZE and XIP_ROM_BASE */
#if defined(XIP_ROM_SIZE) && !defined(XIP_ROM_BASE)
#error "XIP_ROM_SIZE without XIP_ROM_BASE"
#endif
#if defined(XIP_ROM_BASE) && !defined(XIP_ROM_SIZE)
#error "XIP_ROM_BASE without XIP_ROM_SIZE"
#endif
#if !defined(CONFIG_LB_MEM_TOPK)
#error "CONFIG_LB_MEM_TOPK not defined"
#endif
#if defined(XIP_ROM_SIZE) && ((XIP_ROM_SIZE & (XIP_ROM_SIZE -1)) != 0)
#error "XIP_ROM_SIZE is not a power of 2"
#endif
#if defined(XIP_ROM_SIZE) && ((XIP_ROM_BASE % XIP_ROM_SIZE) != 0)
#error "XIP_ROM_BASE is not a multiple of XIP_ROM_SIZE"
#endif
#if (CONFIG_LB_MEM_TOPK & (CONFIG_LB_MEM_TOPK -1)) != 0
# error "CONFIG_LB_MEM_TOPK must be a power of 2"
#endif
static void disable_var_mtrr(unsigned reg)
{
/* The invalid bit is kept in the mask so we simply
* clear the relevant mask register to disable a
* range.
*/
msr_t zero;
zero.lo = zero.hi = 0;
wrmsr(MTRRphysMask_MSR(reg), zero);
}
static void set_var_mtrr(
unsigned reg, unsigned base, unsigned size, unsigned type)
{
/* Bits 32-35 of MTRRphysMask should be set to 1 */
msr_t basem, maskm;
basem.lo = base | type;
basem.hi = 0;
wrmsr(MTRRphysBase_MSR(reg), basem);
maskm.lo = ~(size - 1) | 0x800;
maskm.hi = 0x0f;
wrmsr(MTRRphysMask_MSR(reg), maskm);
}
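/* Worked example (hypothetical values): set_var_mtrr(0, 0x00000000,
* 0x100000, MTRR_TYPE_WRBACK) for a 1MB write-back range writes
* MTRRphysBase0 = 0x00000006 and MTRRphysMask0 = 0x0000000f:fff00800
* (mask = ~(size - 1) with valid bit 11 set and all high address
* bits set).
*/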
static void cache_lbmem(int type)
{
/* Enable caching for the first CONFIG_LB_MEM_TOPK KB using variable mtrr 0 */
disable_cache();
set_var_mtrr(0, 0x00000000, CONFIG_LB_MEM_TOPK << 10, type);
enable_cache();
}
/* The fixed and variable MTRRs power up with random values;
* clear them to MTRR_TYPE_UNCACHEABLE for safety.
*/
static void do_early_mtrr_init(const unsigned long *mtrr_msrs)
{
/* Precondition:
* The cache is not enabled in cr0 nor in MTRRdefType_MSR;
* entry32.inc ensures the cache is not enabled in cr0.
*/
msr_t msr;
const unsigned long *msr_addr;
unsigned long msr_nr;
print_spew("Clearing mtrr\r\n");
/* Initialize all of the relevant msrs to 0 */
msr.lo = 0;
msr.hi = 0;
for(msr_addr = mtrr_msrs; (msr_nr = *msr_addr); msr_addr++) {
wrmsr(msr_nr, msr);
}
#if defined(XIP_ROM_SIZE)
/* Enable caching so we can do execute in place (XIP)
* on the flash rom.
*/
set_var_mtrr(1, XIP_ROM_BASE, XIP_ROM_SIZE, MTRR_TYPE_WRBACK);
#endif
/* Set the default memory type and enable variable MTRRs
*/
/* Enable Variable MTRRs */
msr.hi = 0x00000000;
msr.lo = 0x00000800;
wrmsr(MTRRdefType_MSR, msr);
}
static void early_mtrr_init(void)
{
static const unsigned long mtrr_msrs[] = {
/* fixed mtrr */
0x250, 0x258, 0x259,
0x268, 0x269, 0x26A,
0x26B, 0x26C, 0x26D,
0x26E, 0x26F,
/* var mtrr */
0x200, 0x201, 0x202, 0x203,
0x204, 0x205, 0x206, 0x207,
0x208, 0x209, 0x20A, 0x20B,
0x20C, 0x20D, 0x20E, 0x20F,
/* NULL end of table */
0
};
disable_cache();
do_early_mtrr_init(mtrr_msrs);
enable_cache();
}
#endif /* EARLYMTRR_C */

378
src/cpu/x86/mtrr/mtrr.c Normal file
View File

@@ -0,0 +1,378 @@
/*
* intel_mtrr.c: setting MTRR to decent values for cache initialization on P6
*
* Derived from intel_set_mtrr in intel_subr.c and mtrr.c in linux kernel
*
* Copyright 2000 Silicon Integrated System Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
*
* Reference: Intel Architecture Software Developer's Manual, Volume 3: System Programming
*/
#include <console/console.h>
#include <device/device.h>
#include <cpu/x86/msr.h>
#include <cpu/x86/mtrr.h>
#include <cpu/x86/cache.h>
#define arraysize(x) (sizeof(x)/sizeof((x)[0]))
#warning "FIXME I do not properly handle address more than 36 physical address bits"
#ifdef k8
# define ADDRESS_BITS 40
#else
# define ADDRESS_BITS 36
#endif
#define ADDRESS_BITS_HIGH (ADDRESS_BITS - 32)
#define ADDRESS_MASK_HIGH ((1u << ADDRESS_BITS_HIGH) - 1)
static unsigned int mtrr_msr[] = {
MTRRfix64K_00000_MSR, MTRRfix16K_80000_MSR, MTRRfix16K_A0000_MSR,
MTRRfix4K_C0000_MSR, MTRRfix4K_C8000_MSR, MTRRfix4K_D0000_MSR, MTRRfix4K_D8000_MSR,
MTRRfix4K_E0000_MSR, MTRRfix4K_E8000_MSR, MTRRfix4K_F0000_MSR, MTRRfix4K_F8000_MSR,
};
static void enable_fixed_mtrr(void)
{
msr_t msr;
msr = rdmsr(MTRRdefType_MSR);
msr.lo |= 0xc00;
wrmsr(MTRRdefType_MSR, msr);
}
static void enable_var_mtrr(void)
{
msr_t msr;
msr = rdmsr(MTRRdefType_MSR);
msr.lo |= 0x800;
wrmsr(MTRRdefType_MSR, msr);
}
/* setting variable mtrr, comes from linux kernel source */
static void set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, unsigned char type)
{
msr_t base, mask;
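/* basek and sizek are in KB, so the physical base is basek << 10;
* the low 32 bits of that value go in base.lo and the remaining
* high bits (basek >> 22) go in base.hi.
*/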
base.hi = basek >> 22;
base.lo = basek << 10;
//printk_debug("ADDRESS_MASK_HIGH=%#x\n", ADDRESS_MASK_HIGH);
if (sizek < 4*1024*1024) {
mask.hi = ADDRESS_MASK_HIGH;
mask.lo = ~((sizek << 10) -1);
}
else {
mask.hi = ADDRESS_MASK_HIGH & (~((sizek >> 22) -1));
mask.lo = 0;
}
if (reg >= 8)
return;
// it is recommended that we disable and enable cache when we
// do this.
disable_cache();
if (sizek == 0) {
msr_t zero;
zero.lo = zero.hi = 0;
/* The invalid bit is kept in the mask, so we simply clear the
relevant mask register to disable a range. */
wrmsr (MTRRphysMask_MSR(reg), zero);
} else {
/* Bit 32-35 of MTRRphysMask should be set to 1 */
base.lo |= type;
mask.lo |= 0x800;
wrmsr (MTRRphysBase_MSR(reg), base);
wrmsr (MTRRphysMask_MSR(reg), mask);
}
enable_cache();
}
/* fms: find most significant bit set, stolen from Linux Kernel Source. */
static inline unsigned int fms(unsigned int x)
{
int r;
__asm__("bsrl %1,%0\n\t"
"jnz 1f\n\t"
"movl $0,%0\n"
"1:" : "=r" (r) : "g" (x));
return r;
}
/* fls: find least significant bit set */
static inline unsigned int fls(unsigned int x)
{
int r;
__asm__("bsfl %1,%0\n\t"
"jnz 1f\n\t"
"movl $32,%0\n"
"1:" : "=r" (r) : "g" (x));
return r;
}
/* Setting up variable and fixed mtrr
*
* From Intel Vol. III Section 9.12.4, the Range Size and Base Alignment have some requirements:
* 1. The range size must be 2^N bytes for N >= 12 (i.e. 4KB minimum).
* 2. The base address must be 2^N aligned, where the N here is equal to the N in the previous
* requirement. So an 8K range must be 8K aligned, not 4K aligned.
*
* These requirements are met by "decomposing" the ramsize into Sum(Cn * 2^n, n = [0..N], Cn = [0, 1]).
* For Cm = 1, there is a WB range of 2^m size at base address Sum(Cm * 2^m, m = [N..n]).
* A 124MB (128MB - 4MB SMA) example:
* ramsize = 124MB == 64MB (at 0MB) + 32MB (at 64MB) + 16MB (at 96MB) + 8MB (at 112MB) + 4MB (at 120MB).
* But this wastes a lot of MTRR registers, so we use another more "aggressive" way with Uncacheable Regions.
*
* In the Uncacheable Region scheme, we try to cover the whole ramsize with one WB region if possible.
* If (and only if) this can not be done, we decompose the ramsize; the mathematical formula
* would be ramsize = Sum(Cn * 2^n, n = [0..N], Cn = [-1, 0, 1]). For Cn = -1, an Uncacheable Region is used.
* The same 124MB example:
* ramsize = 124MB == 128MB WB (at 0MB) + 4MB UC (at 124MB)
* or a 156MB (128MB + 32MB - 4MB SMA) example:
* ramsize = 156MB == 128MB WB (at 0MB) + 32MB WB (at 128MB) + 4MB UC (at 156MB)
*/
/* 2 MTRRs could be reserved for the operating system, but with the #if 0 below all 8 variable MTRRs go to the BIOS */
#if 0
#define BIOS_MTRRS 6
#define OS_MTRRS 2
#else
#define BIOS_MTRRS 8
#define OS_MTRRS 0
#endif
#define MTRRS (BIOS_MTRRS + OS_MTRRS)
static void set_fixed_mtrrs(unsigned int first, unsigned int last, unsigned char type)
{
unsigned int i;
unsigned int fixed_msr = NUM_FIXED_RANGES >> 3;
msr_t msr;
msr.lo = msr.hi = 0; /* Shut up gcc */
for(i = first; i < last; i++) {
/* When I switch to a new msr read it in */
if (fixed_msr != i >> 3) {
/* But first write out the old msr */
if (fixed_msr < (NUM_FIXED_RANGES >> 3)) {
disable_cache();
wrmsr(mtrr_msr[fixed_msr], msr);
enable_cache();
}
fixed_msr = i>>3;
msr = rdmsr(mtrr_msr[fixed_msr]);
}
if ((i & 7) < 4) {
msr.lo &= ~(0xff << ((i&3)*8));
msr.lo |= type << ((i&3)*8);
} else {
msr.hi &= ~(0xff << ((i&3)*8));
msr.hi |= type << ((i&3)*8);
}
}
/* Write out the final msr */
if (fixed_msr < (NUM_FIXED_RANGES >> 3)) {
disable_cache();
wrmsr(mtrr_msr[fixed_msr], msr);
enable_cache();
}
}
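/* The fixed MTRRs cover the first 1MB in three granularities:
* ranges 0-7 are 64KB each (0-512KB), ranges 8-23 are 16KB each
* (512KB-768KB), and ranges 24-87 are 4KB each (768KB-1MB).
* fixed_mtrr_index() maps an address in KB to the matching range index.
*/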
static unsigned fixed_mtrr_index(unsigned long addrk)
{
unsigned index;
index = (addrk - 0) >> 6;
if (index >= 8) {
index = ((addrk - 8*64) >> 4) + 8;
}
if (index >= 24) {
index = ((addrk - (8*64 + 16*16)) >> 2) + 24;
}
if (index > NUM_FIXED_RANGES) {
index = NUM_FIXED_RANGES;
}
return index;
}
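/* Worked example: for the 124MB case described earlier,
* range_to_mtrr(0, 0, 124*1024, 0) emits 64MB at 0MB, 32MB at 64MB,
* 16MB at 96MB, 8MB at 112MB and 4MB at 120MB, using five variable MTRRs.
*/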
static unsigned int range_to_mtrr(unsigned int reg,
unsigned long range_startk, unsigned long range_sizek,
unsigned long next_range_startk)
{
if (!range_sizek || (reg >= BIOS_MTRRS)) {
return reg;
}
while(range_sizek) {
unsigned long max_align, align;
unsigned long sizek;
/* Compute the maximum size I can make a range */
max_align = fls(range_startk);
align = fms(range_sizek);
if (align > max_align) {
align = max_align;
}
sizek = 1 << align;
printk_debug("Setting variable MTRR %d, base: %4dMB, range: %4dMB, type WB\n",
reg, range_startk >>10, sizek >> 10);
set_var_mtrr(reg++, range_startk, sizek, MTRR_TYPE_WRBACK);
range_startk += sizek;
range_sizek -= sizek;
if (reg >= BIOS_MTRRS)
break;
}
return reg;
}
static unsigned long resk(uint64_t value)
{
unsigned long resultk;
if (value < (1ULL << 42)) {
resultk = value >> 10;
}
else {
resultk = 0xffffffff;
}
return resultk;
}
void x86_setup_mtrrs(void)
{
/* Try the simple way of incrementally adding mtrrs
* together. If this doesn't work out we can get smart again
* and clear out the mtrrs.
*/
struct device *dev;
unsigned long range_startk, range_sizek;
unsigned int reg;
printk_debug("\n");
/* Initialize the fixed_mtrrs to uncached */
printk_debug("Setting fixed MTRRs(%d-%d) type: UC\n",
0, NUM_FIXED_RANGES);
set_fixed_mtrrs(0, NUM_FIXED_RANGES, MTRR_TYPE_UNCACHEABLE);
/* Now see which of the fixed mtrrs cover ram.
*/
for(dev = all_devices; dev; dev = dev->next) {
struct resource *res, *last;
last = &dev->resource[dev->resources];
for(res = &dev->resource[0]; res < last; res++) {
unsigned int start_mtrr;
unsigned int last_mtrr;
if (!(res->flags & IORESOURCE_MEM) ||
!(res->flags & IORESOURCE_CACHEABLE))
{
continue;
}
start_mtrr = fixed_mtrr_index(resk(res->base));
last_mtrr = fixed_mtrr_index(resk((res->base + res->size)));
if (start_mtrr >= NUM_FIXED_RANGES) {
break;
}
printk_debug("Setting fixed MTRRs(%d-%d) Type: WB\n",
start_mtrr, last_mtrr);
set_fixed_mtrrs(start_mtrr, last_mtrr, MTRR_TYPE_WRBACK);
}
}
printk_debug("DONE fixed MTRRs\n");
/* Cache as many memory areas as possible */
/* FIXME is there an algorithm for computing the optimal set of mtrrs?
* In some cases it is definitely possible to do better.
*/
range_startk = 0;
range_sizek = 0;
reg = 0;
for(dev = all_devices; dev; dev = dev->next) {
struct resource *res, *last;
last = &dev->resource[dev->resources];
for(res = &dev->resource[0]; res < last; res++) {
unsigned long basek, sizek;
if (!(res->flags & IORESOURCE_MEM) ||
!(res->flags & IORESOURCE_CACHEABLE)) {
continue;
}
basek = resk(res->base);
sizek = resk(res->size);
/* See if I can merge with the last range
* Either I am below 1M and the fixed mtrrs handle it, or
* the ranges touch.
*/
if ((basek <= 1024) || (range_startk + range_sizek == basek)) {
unsigned long endk = basek + sizek;
range_sizek = endk - range_startk;
continue;
}
/* Write the range mtrrs */
if (range_sizek != 0) {
reg = range_to_mtrr(reg, range_startk, range_sizek, basek);
range_startk = 0;
range_sizek = 0;
if (reg >= BIOS_MTRRS)
goto last_msr;
}
/* Allocate an msr */
range_startk = basek;
range_sizek = sizek;
}
}
last_msr:
/* Write the last range */
reg = range_to_mtrr(reg, range_startk, range_sizek, 0);
printk_debug("DONE variable MTRRs\n");
printk_debug("Clear out the extra MTRR's\n");
/* Clear out the extra MTRR's */
while(reg < MTRRS) {
set_var_mtrr(reg++, 0, 0, 0);
}
/* enable fixed MTRR */
printk_spew("call enable_fixed_mtrr()\n");
enable_fixed_mtrr();
printk_spew("call enable_var_mtrr()\n");
enable_var_mtrr();
printk_spew("Leave %s\n", __FUNCTION__);
post_code(0x6A);
}
int x86_mtrr_check(void)
{
/* Only Pentium Pro and later have MTRR */
msr_t msr;
printk_debug("\nMTRR check\n");
msr = rdmsr(0x2ff);
msr.lo >>= 10;
printk_debug("Fixed MTRRs : ");
if (msr.lo & 0x01)
printk_debug("Enabled\n");
else
printk_debug("Disabled\n");
printk_debug("Variable MTRRs: ");
if (msr.lo & 0x02)
printk_debug("Enabled\n");
else
printk_debug("Disabled\n");
printk_debug("\n");
post_code(0x93);
return ((int) msr.lo);
}

View File

@@ -0,0 +1 @@
object pgtbl.o

94
src/cpu/x86/pae/pgtbl.c Normal file
View File

@@ -0,0 +1,94 @@
#include <console/console.h>
#include <cpu/cpu.h>
#include <cpu/x86/pae.h>
#include <string.h>
static void paging_off(void)
{
__asm__ __volatile__ (
/* Disable paging */
"movl %%cr0, %%eax\n\t"
"andl $0x7FFFFFFF, %%eax\n\t"
"movl %%eax, %%cr0\n\t"
/* Disable pae */
"movl %%cr4, %%eax\n\t"
"andl $0xFFFFFFDF, %%eax\n\t"
:
:
: "eax"
);
}
static void paging_on(void *pdp)
{
__asm__ __volatile__(
/* Load the page table address */
"movl %0, %%cr3\n\t"
/* Enable pae */
"movl %%cr4, %%eax\n\t"
"orl $0x00000020, %%eax\n\t"
"movl %%eax, %%cr4\n\t"
/* Enable paging */
"movl %%cr0, %%eax\n\t"
"orl $0x80000000, %%eax\n\t"
"movl %%eax, %%cr0\n\t"
:
: "r" (pdp)
: "eax"
);
}
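/* map_2M_page() returns a virtual address through which the given 2MB
* physical page can be reached. Pages below 4GB (windows 0 and 1) are
* returned identity mapped with paging left off; higher pages are
* mapped into the 2GB-4GB virtual range with PAE 2MB pages, using one
* page table per CPU.
*/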
void *map_2M_page(unsigned long page)
{
struct pde {
uint32_t addr_lo;
uint32_t addr_hi;
} __attribute__ ((packed));
struct pg_table {
struct pde pd[2048];
struct pde pdp[512];
} __attribute__ ((packed));
static struct pg_table pgtbl[CONFIG_MAX_CPUS] __attribute__ ((aligned(4096)));
static unsigned long mapped_window[CONFIG_MAX_CPUS];
unsigned long index;
unsigned long window;
void *result;
int i;
index = cpu_index();
if ((index < 0) || (index >= CONFIG_MAX_CPUS)) {
return MAPPING_ERROR;
}
window = page >> 10;
if (window != mapped_window[index]) {
paging_off();
if (window > 1) {
struct pde *pd, *pdp;
/* Point the page directory pointers at the page directories */
memset(&pgtbl[index].pdp, 0, sizeof(pgtbl[index].pdp));
pd = pgtbl[index].pd;
pdp = pgtbl[index].pdp;
pdp[0].addr_lo = ((uint32_t)&pd[512*0])|1;
pdp[1].addr_lo = ((uint32_t)&pd[512*1])|1;
pdp[2].addr_lo = ((uint32_t)&pd[512*2])|1;
pdp[3].addr_lo = ((uint32_t)&pd[512*3])|1;
/* The first half of the page table is identity mapped */
for(i = 0; i < 1024; i++) {
pd[i].addr_lo = ((i & 0x3ff) << 21)| 0xE3;
pd[i].addr_hi = 0;
}
/* The second half of the page table holds the mapped page */
for(i = 1024; i < 2048; i++) {
pd[i].addr_lo = ((window & 1) << 31) | ((i & 0x3ff) << 21) | 0xE3;
pd[i].addr_hi = (window >> 1);
}
paging_on(pdp);
}
mapped_window[index] = window;
}
if (window == 0) {
result = (void *)(page << 21);
} else {
result = (void *)(0x80000000 | ((page & 0x3ff) << 21));
}
return result;
}

View File

@@ -0,0 +1,18 @@
/*
* Put the processor back into a reset state
* with respect to the xmm registers.
*/
xorps %xmm0, %xmm0
xorps %xmm1, %xmm1
xorps %xmm2, %xmm2
xorps %xmm3, %xmm3
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
xorps %xmm6, %xmm6
xorps %xmm7, %xmm7
/* Disable sse instructions */
movl %cr4, %eax
andl $~(3<<9), %eax
movl %eax, %cr4

View File

@@ -0,0 +1,14 @@
/* preserve BIST in %eax */
movl %eax, %ebp
/*
* Enable the use of the xmm registers
*/
/* Enable sse instructions */
movl %cr4, %eax
orl $(1<<9), %eax
movl %eax, %cr4
movl %ebp, %eax

View File

@@ -0,0 +1,5 @@
uses CONFIG_UDELAY_TSC
uses CONFIG_TSC_X86RDTSC_CALIBRATE_WITH_TIMER2
default CONFIG_TSC_X86RDTSC_CALIBRATE_WITH_TIMER2=0
if CONFIG_UDELAY_TSC object delay_tsc.o end

165
src/cpu/x86/tsc/delay_tsc.c Normal file
View File

@@ -0,0 +1,165 @@
#include <console/console.h>
#include <arch/io.h>
#include <cpu/x86/msr.h>
#include <cpu/x86/tsc.h>
#include <smp/spinlock.h>
#include <delay.h>
static unsigned long clocks_per_usec;
#if (CONFIG_TSC_X86RDTSC_CALIBRATE_WITH_TIMER2 == 1)
#define CLOCK_TICK_RATE 1193180U /* Underlying HZ */
/* ------ Calibrate the TSC -------
* Too much 64-bit arithmetic here to do this cleanly in C, and for
* accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
* output busy loop as low as possible. We avoid reading the CTC registers
* directly because of the awkward 8-bit access mechanism of the 82C54
* device.
*/
#define CALIBRATE_INTERVAL ((20*CLOCK_TICK_RATE)/1000) /* 20ms */
#define CALIBRATE_DIVISOR (20*1000) /* 20ms / 20000 == 1usec */
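/* For example, on a hypothetical 1 GHz CPU the 20ms gate accumulates
* roughly 20,000,000 TSC ticks, and 20,000,000 / CALIBRATE_DIVISOR
* (20,000) gives 1000 clocks per microsecond.
*/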
static unsigned long long calibrate_tsc(void)
{
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
/*
* Now let's take care of CTC channel 2
*
* Set the Gate high, program CTC channel 2 for mode 0,
* (interrupt on terminal count mode), binary count,
* load 5 * LATCH count, (LSB and MSB) to begin countdown.
*/
outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
outb(CALIBRATE_INTERVAL & 0xff, 0x42); /* LSB of count */
outb(CALIBRATE_INTERVAL >> 8, 0x42); /* MSB of count */
{
tsc_t start;
tsc_t end;
unsigned long count;
start = rdtsc();
count = 0;
do {
count++;
} while ((inb(0x61) & 0x20) == 0);
end = rdtsc();
/* Error: ECTCNEVERSET */
if (count <= 1)
goto bad_ctc;
/* 64-bit subtract - gcc just messes up with long longs */
__asm__("subl %2,%0\n\t"
"sbbl %3,%1"
:"=a" (end.lo), "=d" (end.hi)
:"g" (start.lo), "g" (start.hi),
"0" (end.lo), "1" (end.hi));
/* Error: ECPUTOOFAST */
if (end.hi)
goto bad_ctc;
/* Error: ECPUTOOSLOW */
if (end.lo <= CALIBRATE_DIVISOR)
goto bad_ctc;
return (end.lo + CALIBRATE_DIVISOR -1)/CALIBRATE_DIVISOR;
}
/*
* The CTC wasn't reliable: we got a hit on the very first read,
* or the CPU was so fast/slow that the quotient wouldn't fit in
* 32 bits..
*/
bad_ctc:
printk_err("bad_ctc\n");
return 0;
}
#else /* CONFIG_TSC_X86RDTSC_CALIBRATE_WITH_TIMER2 */
/*
* this is the "no timer2" version.
* to calibrate tsc, we get a TSC reading, then do 1,000,000 outbs to port 0x80
* then we read TSC again, and divide the difference by 1,000,000
* we have found on a wide range of machines that this gives us a a
* good microsecond value
* to +- 10%. On a dual AMD 1.6 Ghz box, it gives us .97 microseconds, and on a
* 267 Mhz. p5, it gives us 1.1 microseconds.
* also, since gcc now supports long long, we use that.
* also no unsigned long long / operator, so we play games.
* about the only thing you can do with long longs, it seems,
*is return them and assign them.
* (and do asm on them, yuck)
* so avoid all ops on long longs.
*/
static unsigned long long calibrate_tsc(void)
{
unsigned long long start, end, delta;
unsigned long result;
unsigned long count;
start = rdtscll();
// no udivdi3, dammit.
// so we count to 1<< 20 and then right shift 20
for(count = 0; count < (1<<20); count ++)
outb(0x80, 0x80);
end = rdtscll();
#if 0
// make delta be (endhigh - starthigh) + (endlow - startlow)
// but >> 20
// do it this way to avoid gcc warnings.
start = tsc_start.hi;
start <<= 32;
start |= start.lo;
end = tsc_end.hi;
end <<= 32;
end |= tsc_end.lo;
#endif
delta = end - start;
// at this point we have a delta for 1<<20 outbs. Now rescale for one microsecond.
delta >>= 20;
// save this for microsecond timing.
result = delta;
printk_spew("end %x:%x, start %x:%x\n",
endhigh, endlow, starthigh, startlow);
printk_spew("32-bit delta %d\n", (unsigned long) delta);
printk_spew(__FUNCTION__ " 32-bit result is %d\n", result);
return delta;
}
#endif /* CONFIG_TSC_X86RDTSC_CALIBRATE_WITH_TIMER2*/
void init_timer(void)
{
if (!clocks_per_usec) {
clocks_per_usec = calibrate_tsc();
printk_info("clocks_per_usec: %u\n", clocks_per_usec);
}
}
void udelay(unsigned us)
{
unsigned long long count;
unsigned long long stop;
unsigned long long clocks;
init_timer();
clocks = us;
clocks *= clocks_per_usec;
count = rdtscll();
stop = clocks + count;
while(stop > count) {
cpu_relax();
count = rdtscll();
}
}