diff --git a/Cargo.toml b/Cargo.toml index 11307468..3fae47eb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,13 +15,14 @@ libc = "0.2.14" optional = true version = "1.0.0" -[dev-dependencies] +[target.'cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))'.dev-dependencies] simd = "0.1" [features] -default = ["alloc", "valgrind"] +default = ["alloc", "valgrind", "unwind"] alloc = [] valgrind = ["valgrind_request"] +unwind = [] # These apply only to tests within this library; assembly at -O0 is completely # unreadable, so use -O1. diff --git a/README.md b/README.md index feec836a..9aaf20f8 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ there should be at least 8 KiB of free stack space, or panicking will result in ## Limitations -The architectures currently supported are: x86, x86_64, aarch64, or1k. +The architectures currently supported are: x86, x86_64, aarch64, arm, or1k. The platforms currently supported are: bare metal, Linux (any libc), FreeBSD, DragonFly BSD, macOS. @@ -175,13 +175,15 @@ of callee-saved registers. ### Call stack splicing -Non-Windows platforms use [DWARF][] for both stack unwinding and debugging. DWARF call frame -information is very generic to be ABI-agnostic—it defines a bytecode that describes the actions -that need to be performed to simulate returning from a function. libfringe uses this bytecode -to specify that, after the generator function has returned, execution continues at the point -where the generator function was resumed the last time. +Non-Windows platforms use [DWARF][] (or the highly similar [ARM EHABI][ehabi]) for both stack +unwinding and debugging. DWARF call frame information is very generic to be ABI-agnostic— +it defines a bytecode that describes the actions that need to be performed to simulate +returning from a function. libfringe uses this bytecode to specify that, after the generator +function has returned, execution continues at the point where the generator function was +resumed the last time. [dwarf]: http://dwarfstd.org +[ehabi]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0038b/IHI0038B_ehabi.pdf ## Windows compatibility diff --git a/src/arch/aarch64.rs b/src/arch/aarch64.rs index 58a4a643..d2c623ac 100644 --- a/src/arch/aarch64.rs +++ b/src/arch/aarch64.rs @@ -47,14 +47,12 @@ // from the stack frame at x29 (in the parent stack), thus continuing // unwinding at the swap call site instead of falling off the end of context stack. use core::mem; -use stack::Stack; +use arch::StackPointer; +use unwind; pub const STACK_ALIGNMENT: usize = 16; -#[derive(Debug, Clone, Copy)] -pub struct StackPointer(*mut usize); - -pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer { +pub unsafe fn init(stack_base: *mut u8, f: unsafe fn(usize, StackPointer)) -> StackPointer { #[cfg(not(target_vendor = "apple"))] #[naked] unsafe extern "C" fn trampoline_1() { @@ -126,16 +124,38 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) - # trampoline_2. nop - # Call the provided function. - ldr x2, [sp, #16] - blr x2 - "# - : : : : "volatile") - } + # Call unwind_wrapper with the provided function and the stack base address. + add x2, sp, #32 + ldr x3, [sp, #16] + bl ${0} + + # Restore the stack pointer of the parent context. No CFI adjustments + # are needed since we have the same stack frame as trampoline_1. + ldr x2, [sp] + mov sp, x2 + + # Load frame and instruction pointers of the parent context. 
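+        # (The post-indexed `ldp x29, x30, [sp], #16` below restores the saved
+        # frame pointer and return address and pops them in a single
+        # instruction; the .cfi_* directives keep the unwinder's view of those
+        # registers in sync.)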
+ ldp x29, x30, [sp], #16 + .cfi_adjust_cfa_offset -16 + .cfi_restore x29 + .cfi_restore x30 - unsafe fn push(sp: &mut StackPointer, val: usize) { - sp.0 = sp.0.offset(-1); - *sp.0 = val + # If the returned value is nonzero, trigger an unwind in the parent + # context with the given exception object. + cbnz x0, ${1} + + # Clear the stack pointer. We can't call into this context any more once + # the function has returned. + mov x1, #0 + + # Return into the parent context. Use `br` instead of a `ret` to avoid + # return address mispredictions. + br x30 + "# + : + : "s" (unwind::unwind_wrapper as usize) + "s" (unwind::start_unwind as usize) + : : "volatile") } // We set up the stack in a somewhat special way so that to the unwinder it @@ -146,43 +166,34 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) - // followed by the x29 value for that frame. This setup supports unwinding // using DWARF CFI as well as the frame pointer-based unwinding used by tools // such as perf or dtrace. - let mut sp = StackPointer(stack.base() as *mut usize); + let mut sp = StackPointer::new(stack_base); - push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned - push(&mut sp, f as usize); // Function that trampoline_2 should call + sp.push(0 as usize); // Padding to ensure the stack is properly aligned + sp.push(f as usize); // Function that trampoline_2 should call // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline // each time a context switch is performed. - push(&mut sp, trampoline_1 as usize + 4); // Return after the nop - push(&mut sp, 0xdeaddeaddead0cfa); // CFA slot + sp.push(trampoline_1 as usize + 4); // Return after the nop + sp.push(0xdeaddeaddead0cfa); // CFA slot // Call frame for swap::trampoline. We set up the x29 value to point to the // parent call frame. - let frame = sp; - push(&mut sp, trampoline_2 as usize + 4); // Entry point, skip initial nop - push(&mut sp, frame.0 as usize); // Pointer to parent call frame + let frame = sp.offset(0); + sp.push(trampoline_2 as usize + 4); // Entry point, skip initial nop + sp.push(frame as usize); // Pointer to parent call frame sp } #[inline(always)] -pub unsafe fn swap(arg: usize, new_sp: StackPointer, - new_stack: Option<&Stack>) -> (usize, StackPointer) { - // Address of the topmost CFA stack slot. - let mut dummy: usize = mem::uninitialized(); - let new_cfa = if let Some(new_stack) = new_stack { - (new_stack.base() as *mut usize).offset(-4) - } else { - // Just pass a dummy pointer if we aren't linking the stack - &mut dummy - }; - +pub unsafe fn swap_link(arg: usize, new_sp: StackPointer, + new_stack_base: *mut u8) -> (usize, Option) { let ret: usize; - let ret_sp: *mut usize; + let ret_sp: usize; asm!( r#" # Set up the link register - adr lr, 0f + adr x30, 0f # Save the frame pointer and link register; the unwinder uses them to find # the CFA of the caller, and so they have to have the correct value immediately @@ -194,7 +205,7 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, # Link the call stacks together by writing the current stack bottom # address to the CFA slot in the new stack. - str x1, [x3] + str x1, [x3, #-32] # Load stack pointer of the new context. 
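        # (After the mov below we are executing on the new stack; a backtrace
        # taken in the new context can still walk back into the old one through
        # the stack bottom address written to the CFA slot above.)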
mov sp, x2 @@ -211,9 +222,9 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, : "={x0}" (ret) "={x1}" (ret_sp) : "{x0}" (arg) - "{x2}" (new_sp.0) - "{x3}" (new_cfa) - :/*x0, "x1",*/"x2", "x3", "x4", "x5", "x6", "x7", + "{x2}" (new_sp.offset(0)) + "{x3}" (new_stack_base) + :/*"x0", "x1",*/"x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28",/*fp,*/ "lr", /*sp,*/ @@ -228,5 +239,76 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, // the "alignstack" LLVM inline assembly option does exactly the same // thing on AArch64. : "volatile", "alignstack"); - (ret, StackPointer(ret_sp)) + (ret, mem::transmute(ret_sp)) +} + +#[inline(always)] +pub unsafe fn swap(arg: usize, new_sp: StackPointer) -> (usize, StackPointer) { + // This is identical to swap_link, but without the write to the CFA slot. + let ret: usize; + let ret_sp: usize; + asm!( + r#" + adr x30, 0f + stp x29, x30, [sp, #-16]! + mov x1, sp + mov sp, x2 + ldp x29, x30, [sp], #16 + br x30 + 0: + "# + : "={x0}" (ret) + "={x1}" (ret_sp) + : "{x0}" (arg) + "{x2}" (new_sp.offset(0)) + :/*"x0", "x1",*/"x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", + "x24", "x25", "x26", "x27", "x28",/*fp,*/ "lr", /*sp,*/ + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "cc", "memory" + : "volatile", "alignstack"); + (ret, mem::transmute(ret_sp)) +} + +#[inline(always)] +pub unsafe fn unwind(new_sp: StackPointer, new_stack_base: *mut u8) { + // Argument to pass to start_unwind, based on the stack base address. + let arg = unwind::unwind_arg(new_stack_base); + + // This is identical to swap_link, except that it performs a tail call to + // start_unwind instead of returning into the target context. + asm!( + r#" + adr x30, 0f + stp x29, x30, [sp, #-16]! + mov x1, sp + str x1, [x3, #-32] + mov sp, x2 + ldp x29, x30, [sp], #16 + + # Jump to the start_unwind function, which will force a stack unwind in + # the target context. This will eventually return to us through the + # stack link. + b ${0} + 0: + "# + : + : "s" (unwind::start_unwind as usize) + "{x0}" (arg) + "{x2}" (new_sp.offset(0)) + "{x3}" (new_stack_base) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", + "x24", "x25", "x26", "x27", "x28",/*fp,*/ "lr", /*sp,*/ + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "cc", "memory" + : "volatile", "alignstack"); } diff --git a/src/arch/arm.rs b/src/arch/arm.rs new file mode 100644 index 00000000..b40c293e --- /dev/null +++ b/src/arch/arm.rs @@ -0,0 +1,292 @@ +// This file is part of libfringe, a low-level green threading library. +// Copyright (c) Nathan Zadoks , +// whitequark +// Amanieu d'Antras +// Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be +// copied, modified, or distributed except according to those terms. 
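+
+// NOTE: the ARM EHABI .setfp/.save directives in the trampolines below are
+// left commented out because LLVM does not currently support EHABI directives
+// in inline assembly. src/unwind.rs compensates for this by propagating
+// panics across contexts manually on ARM (see have_cross_stack_unwind).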
+ +// To understand the machine code in this file, keep in mind these facts: +// * ARM AAPCS ABI passes the first argument in r0. We also use r0 to pass a value +// while swapping context; this is an arbitrary choice +// (we clobber all registers and could use any of them) but this allows us +// to reuse the swap function to perform the initial call. +// +// To understand the ARM EHABI CFI code in this file, keep in mind these facts: +// * CFI is "call frame information"; a set of instructions to a debugger or +// an unwinder that allow it to simulate returning from functions. This implies +// restoring every register to its pre-call state, as well as the stack pointer. +// * CFA is "call frame address"; the value of stack pointer right before the call +// instruction in the caller. Everything strictly below CFA (and inclusive until +// the next CFA) is the call frame of the callee. This implies that the return +// address is the part of callee's call frame. +// * Logically, ARM EHABI CFI is a table where rows are instruction pointer values and +// columns describe where registers are spilled (mostly using expressions that +// compute a memory location as CFA+n). A .save pseudoinstruction changes +// the state of a column for all IP numerically larger than the one it's placed +// after. A .pad or .setfp pseudoinstructions change the CFA value similarly. +// * Simulating return is as easy as restoring register values from the CFI table +// and then setting stack pointer to CFA. +// +// A high-level overview of the function of the trampolines is: +// * The 2nd init trampoline puts a controlled value (written in swap to `new_cfa`) +// into r11. This is then used as the CFA for the 1st trampoline. +// * This controlled value points to the bottom of the stack of the parent context, +// which holds the saved r11 and lr from the call to swap(). +// * The 1st init trampoline tells the unwinder to restore r11 and lr +// from the stack frame at r11 (in the parent stack), thus continuing +// unwinding at the swap call site instead of falling off the end of context stack. +use core::mem; +use arch::StackPointer; +use unwind; + +pub const STACK_ALIGNMENT: usize = 8; + +pub unsafe fn init(stack_base: *mut u8, f: unsafe fn(usize, StackPointer)) -> StackPointer { + #[cfg(not(target_vendor = "apple"))] + #[naked] + unsafe extern "C" fn trampoline_1() { + asm!( + r#" + # gdb has a hardcoded check that rejects backtraces where frame addresses + # do not monotonically decrease. It is turned off if the function is called + # "__morestack" and that is hardcoded. So, to make gdb backtraces match + # the actual unwinder behavior, we call ourselves "__morestack" and mark + # the symbol as local; it shouldn't interfere with anything. + __morestack: + .local __morestack + + # Set up the first part of our ARM EHABI CFI linking stacks together. When + # we reach this function from unwinding, r11 will be pointing at the bottom + # of the parent linked stack. This link is set each time swap() is called. + # When unwinding the frame corresponding to this function, a ARM EHABI unwinder + # will use r11+16 as the next call frame address, restore return address (lr) + # from CFA-8 and restore r11 from CFA-16. This mirrors what the second half + # of `swap_trampoline` does. + # .setfp fp, sp + # .save {fp, lr} + + # This nop is here so that the initial swap doesn't return to the start + # of the trampoline, which confuses the unwinder since it will look for + # frame information in the previous symbol rather than this one. 
It is
+      # never actually executed.
+      nop
+
+      .Lend:
+      .size __morestack, .Lend-__morestack
+      "#
+      : : : : "volatile")
+  }
+
+  #[cfg(target_vendor = "apple")]
+  #[naked]
+  unsafe extern "C" fn trampoline_1() {
+    asm!(
+      r#"
+      # Identical to the above, except avoids .local/.size that aren't available on Mach-O.
+      __morestack:
+      .private_extern __morestack
+      # .setfp fp, sp
+      # .save {fp, lr}
+      nop
+      "#
+      : : : : "volatile")
+  }
+
+  #[naked]
+  unsafe extern "C" fn trampoline_2() {
+    asm!(
+      r#"
+      # Set up the second part of our ARM EHABI CFI.
+      # When unwinding the frame corresponding to this function, an ARM EHABI
+      # unwinder will restore r11 (and thus the CFA of the first trampoline)
+      # from the stack slot. This stack slot is updated every time swap() is
+      # called to point to the bottom of the stack of the context just
+      # switched from.
+      # .setfp fp, sp
+      # .save {fp, lr}
+
+      # This nop is here so that the return address of the swap trampoline
+      # doesn't point to the start of the symbol. This confuses gdb's backtraces,
+      # causing them to think the parent function is trampoline_1 instead of
+      # trampoline_2.
+      nop
+
+      # Call unwind_wrapper with the provided function and the stack base address.
+      add r2, sp, #16
+      ldr r3, [sp, #8]
+      bl ${0}
+
+      # Restore the stack pointer of the parent context. No CFI adjustments
+      # are needed since we have the same stack frame as trampoline_1.
+      ldr sp, [sp]
+
+      # Load frame and instruction pointers of the parent context.
+      pop {fp, lr}
+
+      # If the returned value is nonzero, trigger an unwind in the parent
+      # context with the given exception object.
+      cmp r0, #0
+      bne ${1}
+
+      # Clear the stack pointer. We can't call into this context any more once
+      # the function has returned.
+      mov r1, #0
+
+      # Return into the parent context. Use `r12` instead of `lr` to avoid
+      # return address mispredictions.
+      mov r12, lr
+      bx r12
+      "#
+      :
+      : "s" (unwind::unwind_wrapper as usize)
+        "s" (unwind::start_unwind as usize)
+      : : "volatile")
+  }
+
+  // We set up the stack in a somewhat special way so that to the unwinder it
+  // looks like trampoline_1 has called trampoline_2, which has in turn called
+  // swap::trampoline.
+  //
+  // There are 2 call frames in this setup, each containing the return address
+  // followed by the r11 value for that frame. This setup supports unwinding
+  // using ARM EHABI as well as the frame pointer-based unwinding used by tools
+  // such as perf or dtrace.
+  let mut sp = StackPointer::new(stack_base);
+
+  sp.push(0 as usize); // Padding to ensure the stack is properly aligned
+  sp.push(f as usize); // Function that trampoline_2 should call
+
+  // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline
+  // each time a context switch is performed.
+  sp.push(trampoline_1 as usize + 4); // Return after the nop
+  sp.push(0xdead0cfa);                // CFA slot
+
+  // Call frame for swap::trampoline. We set up the r11 value to point to the
+  // parent call frame.
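+  // The resulting layout, from the stack base downwards, is:
+  //
+  //   [ padding          ]
+  //   [ f                ]
+  //   [ trampoline_1 + 4 ]  return address of trampoline_2's frame
+  //   [ CFA slot         ]  <- frame
+  //   [ trampoline_2 + 4 ]  return address of swap's frame
+  //   [ frame            ]  <- sp (saved r11)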
+ let frame = sp.offset(0); + sp.push(trampoline_2 as usize + 4); // Entry point, skip initial nop + sp.push(frame as usize); // Pointer to parent call frame + + sp +} + +#[inline(always)] +pub unsafe fn swap_link(arg: usize, new_sp: StackPointer, + new_stack_base: *mut u8) -> (usize, Option) { + let ret: usize; + let ret_sp: usize; + asm!( + r#" + # Set up the link register + adr lr, 0f + + # Save the frame pointer and link register; the unwinder uses them to find + # the CFA of the caller, and so they have to have the correct value immediately + # after the call instruction that invoked the trampoline. + push {fp, lr} + + # Pass the stack pointer of the old context to the new one. + mov r1, sp + + # Link the call stacks together by writing the current stack bottom + # address to the CFA slot in the new stack. + str sp, [r3, #-16] + + # Load stack pointer of the new context. + mov sp, r2 + + # Load frame and instruction pointers of the new context. + pop {fp, r12} + + # Return into the new context. Use `r12` instead of `lr` to avoid + # return address mispredictions. + bx r12 + + 0: + "# + : "={r0}" (ret) + "={r1}" (ret_sp) + : "{r0}" (arg) + "{r2}" (new_sp.offset(0)) + "{r3}" (new_stack_base) + :/*r0, r1,*/ "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10",/*r11,*/"r12",/*sp,*/ "lr", /*pc,*/ + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", + "cc", "memory" + : "volatile"); + (ret, mem::transmute(ret_sp)) +} + +#[inline(always)] +pub unsafe fn swap(arg: usize, new_sp: StackPointer) -> (usize, StackPointer) { + // This is identical to swap_link, but without the write to the CFA slot. + let ret: usize; + let ret_sp: usize; + asm!( + r#" + adr lr, 0f + push {fp, lr} + mov r1, sp + mov sp, r2 + pop {fp, r12} + bx r12 + 0: + "# + : "={r0}" (ret) + "={r1}" (ret_sp) + : "{r0}" (arg) + "{r2}" (new_sp.offset(0)) + :/*r0, r1,*/ "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10",/*r11,*/"r12",/*sp,*/ "lr", /*pc,*/ + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", + "cc", "memory" + // We need the "alignstack" attribute here to ensure that the stack is + // properly aligned if a call to start_unwind needs to be injected into + // our stack context. + : "volatile", "alignstack"); + (ret, mem::transmute(ret_sp)) +} + +#[inline(always)] +pub unsafe fn unwind(new_sp: StackPointer, new_stack_base: *mut u8) { + // Argument to pass to start_unwind, based on the stack base address. + let arg = unwind::unwind_arg(new_stack_base); + + // This is identical to swap_link, except that it performs a tail call to + // start_unwind instead of returning into the target context. + asm!( + r#" + adr lr, 0f + push {fp, lr} + str sp, [r3, #-16] + mov sp, r2 + pop {fp, r12} + + # Jump to the start_unwind function, which will force a stack unwind in + # the target context. This will eventually return to us through the + # stack link. 
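+      # (start_unwind raises a panic carrying an UnwindMarker on the target
+      # stack; unwind_wrapper catches it there, and trampoline_2 then follows
+      # the link written above to return to label 0 below.)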
+ b ${0} + + 0: + "# + : + : "s" (unwind::start_unwind as usize) + "{r0}" (arg) + "{r2}" (new_sp.offset(0)) + "{r3}" (new_stack_base) + : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10",/*r11,*/"r12",/*sp,*/ "lr", /*pc,*/ + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", + "cc", "memory" + : "volatile"); +} diff --git a/src/arch/mod.rs b/src/arch/mod.rs index 1ed3aee5..f603980b 100644 --- a/src/arch/mod.rs +++ b/src/arch/mod.rs @@ -7,73 +7,98 @@ // copied, modified, or distributed except according to those terms. pub use self::imp::*; +use core::ptr::NonNull; #[allow(unused_attributes)] // rust-lang/rust#35584 #[cfg_attr(target_arch = "x86", path = "x86.rs")] #[cfg_attr(target_arch = "x86_64", path = "x86_64.rs")] #[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")] +#[cfg_attr(target_arch = "arm", path = "arm.rs")] #[cfg_attr(target_arch = "or1k", path = "or1k.rs")] mod imp; +#[derive(Debug, Clone, Copy)] +pub struct StackPointer(NonNull); + +impl StackPointer { + #[inline(always)] + pub unsafe fn push(&mut self, val: usize) { + self.0 = NonNull::new_unchecked(self.0.as_ptr().offset(-1)); + *self.0.as_mut() = val; + } + + #[inline(always)] + pub unsafe fn new(sp: *mut u8) -> StackPointer { + StackPointer(NonNull::new_unchecked(sp as *mut usize)) + } + + #[inline(always)] + pub unsafe fn offset(&self, count: isize) -> *mut usize { + self.0.as_ptr().offset(count) + } +} + #[cfg(test)] mod tests { extern crate test; + #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] extern crate simd; use arch::{self, StackPointer}; - use ::OsStack; + use ::{Stack, OsStack}; #[test] fn context() { - unsafe extern "C" fn adder(arg: usize, stack_ptr: StackPointer) -> ! { + unsafe fn adder(arg: usize, stack_ptr: StackPointer) { println!("it's alive! arg: {}", arg); - let (arg, stack_ptr) = arch::swap(arg + 1, stack_ptr, None); + let (arg, stack_ptr) = arch::swap(arg + 1, stack_ptr); println!("still alive! arg: {}", arg); - arch::swap(arg + 1, stack_ptr, None); + arch::swap(arg + 1, stack_ptr); panic!("i should be dead"); } unsafe { let stack = OsStack::new(4 << 20).unwrap(); - let stack_ptr = arch::init(&stack, adder); + let stack_ptr = arch::init(stack.base(), adder); - let (ret, stack_ptr) = arch::swap(10, stack_ptr, Some(&stack)); + let (ret, stack_ptr) = arch::swap_link(10, stack_ptr, stack.base()); assert_eq!(ret, 11); - let (ret, _) = arch::swap(50, stack_ptr, Some(&stack)); + let (ret, _) = arch::swap_link(50, stack_ptr.unwrap(), stack.base()); assert_eq!(ret, 51); } } + #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] #[test] fn context_simd() { - unsafe extern "C" fn permuter(arg: usize, stack_ptr: StackPointer) -> ! { + unsafe fn permuter(arg: usize, stack_ptr: StackPointer) { // This will crash if the stack is not aligned properly. let x = simd::i32x4::splat(arg as i32); let y = x * x; println!("simd result: {:?}", y); - let (_, stack_ptr) = arch::swap(0, stack_ptr, None); + let (_, stack_ptr) = arch::swap(0, stack_ptr); // And try again after a context switch. 
let x = simd::i32x4::splat(arg as i32); let y = x * x; println!("simd result: {:?}", y); - arch::swap(0, stack_ptr, None); + arch::swap(0, stack_ptr); panic!("i should be dead"); } unsafe { let stack = OsStack::new(4 << 20).unwrap(); - let stack_ptr = arch::init(&stack, permuter); + let stack_ptr = arch::init(stack.base(), permuter); - let (_, stack_ptr) = arch::swap(10, stack_ptr, Some(&stack)); - arch::swap(20, stack_ptr, Some(&stack)); + let (_, stack_ptr) = arch::swap_link(10, stack_ptr, stack.base()); + arch::swap_link(20, stack_ptr.unwrap(), stack.base()); } } - unsafe extern "C" fn do_panic(arg: usize, stack_ptr: StackPointer) -> ! { + unsafe fn do_panic(arg: usize, stack_ptr: StackPointer) { match arg { 0 => panic!("arg=0"), 1 => { - arch::swap(0, stack_ptr, None); + arch::swap(0, stack_ptr); panic!("arg=1"); } _ => unreachable!() @@ -85,9 +110,9 @@ mod tests { fn panic_after_start() { unsafe { let stack = OsStack::new(4 << 20).unwrap(); - let stack_ptr = arch::init(&stack, do_panic); + let stack_ptr = arch::init(stack.base(), do_panic); - arch::swap(0, stack_ptr, Some(&stack)); + arch::swap_link(0, stack_ptr, stack.base()); } } @@ -96,20 +121,33 @@ mod tests { fn panic_after_swap() { unsafe { let stack = OsStack::new(4 << 20).unwrap(); - let stack_ptr = arch::init(&stack, do_panic); + let stack_ptr = arch::init(stack.base(), do_panic); + + let (_, stack_ptr) = arch::swap_link(1, stack_ptr, stack.base()); + arch::swap_link(0, stack_ptr.unwrap(), stack.base()); + } + } + + #[test] + fn ret() { + unsafe fn ret2(_: usize, _: StackPointer) {} + + unsafe { + let stack = OsStack::new(4 << 20).unwrap(); + let stack_ptr = arch::init(stack.base(), ret2); - let (_, stack_ptr) = arch::swap(1, stack_ptr, Some(&stack)); - arch::swap(0, stack_ptr, Some(&stack)); + let (_, stack_ptr) = arch::swap_link(0, stack_ptr, stack.base()); + assert!(stack_ptr.is_none()); } } #[bench] fn swap(b: &mut test::Bencher) { - unsafe extern "C" fn loopback(mut arg: usize, mut stack_ptr: StackPointer) -> ! { + unsafe fn loopback(mut arg: usize, mut stack_ptr: StackPointer) { // This deliberately does not ignore arg, to measure the time it takes // to move the return value between registers. loop { - let data = arch::swap(arg, stack_ptr, None); + let data = arch::swap(arg, stack_ptr); arg = data.0; stack_ptr = data.1; } @@ -117,10 +155,10 @@ mod tests { unsafe { let stack = OsStack::new(4 << 20).unwrap(); - let mut stack_ptr = arch::init(&stack, loopback); + let mut stack_ptr = arch::init(stack.base(), loopback); b.iter(|| for _ in 0..10 { - stack_ptr = arch::swap(0, stack_ptr, Some(&stack)).1; + stack_ptr = arch::swap_link(0, stack_ptr, stack.base()).1.unwrap(); }); } } diff --git a/src/arch/or1k.rs b/src/arch/or1k.rs index b74f2b71..b5b42aa0 100644 --- a/src/arch/or1k.rs +++ b/src/arch/or1k.rs @@ -42,14 +42,12 @@ // from the stack frame at r2 (in the parent stack), thus continuing // unwinding at the swap call site instead of falling off the end of context stack. use core::mem; -use stack::Stack; +use arch::StackPointer; +use unwind; pub const STACK_ALIGNMENT: usize = 4; -#[derive(Debug, Clone, Copy)] -pub struct StackPointer(*mut usize); - -pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer { +pub unsafe fn init(stack_base: *mut u8, f: unsafe fn(usize, StackPointer)) -> StackPointer { #[naked] unsafe extern "C" fn trampoline_1() { asm!( @@ -104,17 +102,38 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) - # trampoline_2. 
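      # (This l.nop is only a landing pad and is never executed; the l.nop
      # after each l.jal/l.jr below is different: it fills OpenRISC's branch
      # delay slot.)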
l.nop - # Call the provided function. - l.lwz r5, 8(r1) - l.jalr r5 + # Call unwind_wrapper with the provided function and the stack base address. + l.addi r5, r1, 12 + l.lwz r6, 8(r1) + l.jal ${0} l.nop - "# - : : : : "volatile") - } - unsafe fn push(sp: &mut StackPointer, val: usize) { - sp.0 = sp.0.offset(-1); - *sp.0 = val + # Restore the stack pointer of the parent context. No CFI adjustments + # are needed since we have the same stack frame as trampoline_1. + l.lwz r1, 0(r1) + + # Load frame and instruction pointers of the parent context. + l.lwz r2, -4(r1) + l.lwz r9, -8(r1) + + # If the returned value is nonzero, trigger an unwind in the parent + # context with the given exception object. + l.or r4, r0, r11 + l.sfeq r11, r0 + l.bf ${1} + + # Clear the stack pointer. We can't call into this context any more once + # the function has returned. + l.or r4, r0, r0 + + # Return into the parent context. + l.jr r9 + l.nop + "# + : + : "s" (unwind::unwind_wrapper as usize) + "s" (unwind::start_unwind as usize) + : : "volatile") } // We set up the stack in a somewhat special way so that to the unwinder it @@ -125,20 +144,20 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) - // followed by the r2 value for that frame. This setup supports unwinding // using DWARF CFI as well as the frame pointer-based unwinding used by tools // such as perf or dtrace. - let mut sp = StackPointer(stack.base() as *mut usize); + let mut sp = StackPointer::new(stack_base); - push(&mut sp, f as usize); // Function that trampoline_2 should call + sp.push(f as usize); // Function that trampoline_2 should call // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline // each time a context switch is performed. - push(&mut sp, 0xdead0cfa); // CFA slot - push(&mut sp, trampoline_1 as usize + 4); // Return after the nop + sp.push(0xdead0cfa); // CFA slot + sp.push(trampoline_1 as usize + 4); // Return after the nop // Call frame for swap::trampoline. We set up the r2 value to point to the // parent call frame. let frame = sp; - push(&mut sp, frame.0 as usize); // Pointer to parent call frame - push(&mut sp, trampoline_2 as usize + 4); // Entry point, skip initial nop + sp.push(frame.offset(0) as usize); // Pointer to parent call frame + sp.push(trampoline_2 as usize + 4); // Entry point, skip initial nop // The last two values are read by the swap trampoline and are actually in the // red zone and not below the stack pointer. @@ -146,17 +165,8 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) - } #[inline(always)] -pub unsafe fn swap(arg: usize, new_sp: StackPointer, - new_stack: Option<&Stack>) -> (usize, StackPointer) { - // Address of the topmost CFA stack slot. - let mut dummy: usize = mem::uninitialized(); - let new_cfa = if let Some(new_stack) = new_stack { - (new_stack.base() as *mut usize).offset(-2) - } else { - // Just pass a dummy pointer if we aren't linking the stack - &mut dummy - }; - +pub unsafe fn swap_link(arg: usize, new_sp: StackPointer, + new_stack_base: *mut u8) -> (usize, Option) { #[naked] unsafe extern "C" fn trampoline() { asm!( @@ -172,14 +182,13 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, # Link the call stacks together by writing the current stack bottom # address to the CFA slot in the new stack. l.addi r7, r1, -8 - l.sw 0(r6), r7 + l.sw -8(r6), r7 # Pass the stack pointer of the old context to the new one. l.or r4, r0, r1 # Load stack pointer of the new context. 
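      # (r0 is hard-wired to zero on OpenRISC, so `l.or rD, r0, rS` is the
      # idiomatic register-to-register move.)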
l.or r1, r0, r5 - # Restore frame pointer and link register of the new context. # Load frame and instruction pointers of the new context. l.lwz r2, -4(r1) l.lwz r9, -8(r1) @@ -192,7 +201,7 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, } let ret: usize; - let ret_sp: *mut usize; + let ret_sp: usize; asm!( r#" # Call the trampoline to switch to the new context. @@ -203,13 +212,106 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, "={r4}" (ret_sp) : "s" (trampoline as usize) "{r3}" (arg) - "{r5}" (new_sp.0) - "{r6}" (new_cfa) + "{r5}" (new_sp.offset(0)) + "{r6}" (new_stack_base) + :/*"r0", "r1", "r2", "r3", "r4",*/"r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", + "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31", + "cc", "memory" + : "volatile"); + (ret, mem::transmute(ret_sp)) +} + +#[inline(always)] +pub unsafe fn swap(arg: usize, new_sp: StackPointer) -> (usize, StackPointer) { + // This is identical to swap_link, but without the write to the CFA slot. + #[naked] + unsafe extern "C" fn trampoline() { + asm!( + r#" + l.sw -4(r1), r2 + l.sw -8(r1), r9 + .cfi_offset r2, -4 + .cfi_offset r9, -8 + l.or r4, r0, r1 + l.or r1, r0, r5 + l.lwz r2, -4(r1) + l.lwz r9, -8(r1) + l.jr r9 + l.nop + "# + : : : : "volatile") + } + + let ret: usize; + let ret_sp: usize; + asm!( + r#" + l.jal ${2} + l.nop + "# + : "={r3}" (ret) + "={r4}" (ret_sp) + : "s" (trampoline as usize) + "{r3}" (arg) + "{r5}" (new_sp.offset(0)) :/*"r0", "r1", "r2", "r3", "r4",*/"r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31", "cc", "memory" : "volatile"); - (ret, StackPointer(ret_sp)) + (ret, mem::transmute(ret_sp)) +} + +#[inline(always)] +pub unsafe fn unwind(new_sp: StackPointer, new_stack_base: *mut u8) { + // Argument to pass to start_unwind, based on the stack base address. + let arg = unwind::unwind_arg(new_stack_base); + + // This is identical to swap_link, except that it performs a tail call to + // start_unwind instead of returning into the target context. + #[naked] + unsafe extern "C" fn trampoline() { + asm!( + r#" + l.sw -4(r1), r2 + l.sw -8(r1), r9 + .cfi_offset r2, -4 + .cfi_offset r9, -8 + l.addi r7, r1, -8 + l.sw -8(r6), r7 + l.or r1, r0, r5 + l.lwz r2, -4(r1) + l.lwz r9, -8(r1) + + # Jump to the start_unwind function, which will force a stack unwind in + # the target context. This will eventually return to us through the + # stack link. + l.j ${0} + l.nop + "# + : + : "s" (unwind::start_unwind as usize) + : : "volatile") + } + + asm!( + r#" + # Call the trampoline to switch to the new context. + l.jal ${0} + l.nop + "# + : + : "s" (trampoline as usize) + "{r3}" (arg) + "{r5}" (new_sp.offset(0)) + "{r6}" (new_stack_base) + :/*"r0", "r1", "r2",*/"r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", + "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31", + "cc", "memory" + : "volatile"); } diff --git a/src/arch/x86.rs b/src/arch/x86.rs index 1543a03e..65cf30cf 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86.rs @@ -42,14 +42,29 @@ // address from the stack frame at %ebp (in the parent stack), thus continuing // unwinding at the swap call site instead of falling off the end of context stack. 
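// Note that on x86 the CFA slot ends up 16 bytes below the stack base: init()
// performs four 4-byte pushes (padding, f, trampoline_1, CFA slot). This is
// why swap_link and unwind below store through -16(%ecx).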
use core::mem; -use stack::Stack; +use arch::StackPointer; +use unwind; pub const STACK_ALIGNMENT: usize = 16; -#[derive(Debug, Clone, Copy)] -pub struct StackPointer(*mut usize); +// Rust's fastcall support is currently broken due to #18086, so we use a +// custom wrapper instead. We don't quite follow the normal fastcall ABI since +// we accept the first parameter in %edi rather than the usual %ecx. +#[naked] +unsafe extern "C" fn fastcall_start_unwind() { + asm!( + r#" + subl $$12, %esp + .cfi_adjust_cfa_offset 12 + movl %edi, (%esp) + call ${0:c} + "# + : + : "s" (unwind::start_unwind as usize) + : : "volatile") +} -pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer { +pub unsafe fn init(stack_base: *mut u8, f: unsafe fn(usize, StackPointer)) -> StackPointer { #[cfg(not(target_vendor = "apple"))] #[naked] unsafe extern "C" fn trampoline_1() { @@ -124,18 +139,44 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) - # trampoline_2. nop - # Push arguments. + # Call unwind_wrapper with the provided function and the CFA address. + leal 16(%esp), %edx + pushl 8(%esp) + pushl %edx pushl %esi pushl %edi - # Call the provided function. - calll *16(%esp) - "# - : : : : "volatile") - } + call ${0:c} + + # Restore the stack pointer of the parent context. No CFI adjustments + # are needed since we have the same stack frame as trampoline_1. + movl 16(%esp), %esp - unsafe fn push(sp: &mut StackPointer, val: usize) { - sp.0 = sp.0.offset(-1); - *sp.0 = val + # Restore frame pointer of the parent context. + popl %ebp + .cfi_adjust_cfa_offset -4 + .cfi_restore %ebp + + # If the returned value is nonzero, trigger an unwind in the parent + # context with the given exception object. + movl %eax, %edi + testl %eax, %eax + jnz ${1:c} + + # Clear the stack pointer. We can't call into this context any more once + # the function has returned. + xorl %esi, %esi + + # Return into the parent context. Use `pop` and `jmp` instead of a `ret` + # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge). + popl %eax + .cfi_adjust_cfa_offset -4 + .cfi_register %eip, %eax + jmpl *%eax + "# + : + : "s" (unwind::unwind_wrapper as usize) + "s" (fastcall_start_unwind as usize) + : : "volatile") } // We set up the stack in a somewhat special way so that to the unwinder it @@ -146,39 +187,28 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) - // followed by the %ebp value for that frame. This setup supports unwinding // using DWARF CFI as well as the frame pointer-based unwinding used by tools // such as perf or dtrace. - let mut sp = StackPointer(stack.base() as *mut usize); + let mut sp = StackPointer::new(stack_base); - push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned - push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned - push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned - push(&mut sp, f as usize); // Function that trampoline_2 should call + sp.push(0 as usize); // Padding to ensure the stack is properly aligned + sp.push(f as usize); // Function that trampoline_2 should call // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline // each time a context switch is performed. 
- push(&mut sp, trampoline_1 as usize + 2); // Return after the 2 nops - push(&mut sp, 0xdead0cfa); // CFA slot + sp.push(trampoline_1 as usize + 2); // Return after the 2 nops + sp.push(0xdead0cfa); // CFA slot // Call frame for swap::trampoline. We set up the %ebp value to point to the // parent call frame. - let frame = sp; - push(&mut sp, trampoline_2 as usize + 1); // Entry point, skip initial nop - push(&mut sp, frame.0 as usize); // Pointer to parent call frame + let frame = sp.offset(0); + sp.push(trampoline_2 as usize + 1); // Entry point, skip initial nop + sp.push(frame as usize); // Pointer to parent call frame sp } #[inline(always)] -pub unsafe fn swap(arg: usize, new_sp: StackPointer, - new_stack: Option<&Stack>) -> (usize, StackPointer) { - // Address of the topmost CFA stack slot. - let mut dummy: usize = mem::uninitialized(); - let new_cfa = if let Some(new_stack) = new_stack { - (new_stack.base() as *mut usize).offset(-6) - } else { - // Just pass a dummy pointer if we aren't linking the stack - &mut dummy - }; - +pub unsafe fn swap_link(arg: usize, new_sp: StackPointer, + new_stack_base: *mut u8) -> (usize, Option) { #[naked] unsafe extern "C" fn trampoline() { asm!( @@ -192,7 +222,7 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, # Link the call stacks together by writing the current stack bottom # address to the CFA slot in the new stack. - movl %esp, (%ecx) + movl %esp, -16(%ecx) # Pass the stack pointer of the old context to the new one. movl %esp, %esi @@ -215,7 +245,7 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, } let ret: usize; - let ret_sp: *mut usize; + let ret_sp: usize; asm!( r#" # Push instruction pointer of the old context and switch to @@ -226,12 +256,103 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, "={esi}" (ret_sp) : "s" (trampoline as usize) "{edi}" (arg) - "{edx}" (new_sp.0) - "{ecx}" (new_cfa) - : "eax", "ebx", "ecx", "edx", /*"esi", "edi", "ebp", "esp",*/ + "{edx}" (new_sp.offset(0)) + "{ecx}" (new_stack_base) + : "eax", "ebx", "ecx", "edx",/*"esi", "edi", "ebp", "esp",*/ + "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "cc", "dirflag", "fpsr", "flags", "memory" + : "volatile"); + (ret, mem::transmute(ret_sp)) +} + +#[inline(always)] +pub unsafe fn swap(arg: usize, new_sp: StackPointer) -> (usize, StackPointer) { + // This is identical to swap_link, but without the write to the CFA slot. + #[naked] + unsafe extern "C" fn trampoline() { + asm!( + r#" + pushl %ebp + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %ebp, 0 + movl %esp, %esi + movl %edx, %esp + popl %ebp + .cfi_adjust_cfa_offset -4 + .cfi_restore %ebp + popl %eax + .cfi_adjust_cfa_offset -4 + .cfi_register %eip, %eax + jmpl *%eax + "# + : : : : "volatile") + } + + let ret: usize; + let ret_sp: usize; + asm!( + r#" + call ${2:c} + "# + : "={edi}" (ret) + "={esi}" (ret_sp) + : "s" (trampoline as usize) + "{edi}" (arg) + "{edx}" (new_sp.offset(0)) + : "eax", "ebx", "ecx", "edx",/*"esi", "edi", "ebp", "esp",*/ + "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "cc", "dirflag", "fpsr", "flags", "memory" + // We need the "alignstack" attribute here to ensure that the stack is + // properly aligned if a call to start_unwind needs to be injected into + // our stack context. 
+ : "volatile", "alignstack"); + (ret, mem::transmute(ret_sp)) +} + +#[inline(always)] +pub unsafe fn unwind(new_sp: StackPointer, new_stack_base: *mut u8) { + // Argument to pass to start_unwind, based on the stack base address. + let arg = unwind::unwind_arg(new_stack_base); + + // This is identical to swap_link, except that it performs a tail call to + // start_unwind instead of returning into the target context. + #[naked] + unsafe extern "C" fn trampoline() { + asm!( + r#" + pushl %ebp + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %ebp, 0 + movl %esp, -16(%ecx) + movl %edx, %esp + popl %ebp + .cfi_adjust_cfa_offset -4 + .cfi_restore %ebp + + # Jump to the start_unwind function, which will force a stack unwind in + # the target context. This will eventually return to us through the + # stack link. + jmp ${0:c} + "# + : + : "s" (fastcall_start_unwind as usize) + : : "volatile") + } + + asm!( + r#" + call ${0:c} + "# + : + : "s" (trampoline as usize) + "{edi}" (arg) + "{edx}" (new_sp.offset(0)) + "{ecx}" (new_stack_base) + : "eax", "ebx", "ecx", "edx", "esi", "edi",/*"ebp", "esp",*/ "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "cc", "dirflag", "fpsr", "flags", "memory" : "volatile"); - (ret, StackPointer(ret_sp)) } diff --git a/src/arch/x86_64.rs b/src/arch/x86_64.rs index 1da7dc28..c69ac61b 100644 --- a/src/arch/x86_64.rs +++ b/src/arch/x86_64.rs @@ -47,14 +47,12 @@ // address from the stack frame at %rbp (in the parent stack), thus continuing // unwinding at the swap call site instead of falling off the end of context stack. use core::mem; -use stack::Stack; +use arch::StackPointer; +use unwind; pub const STACK_ALIGNMENT: usize = 16; -#[derive(Debug, Clone, Copy)] -pub struct StackPointer(*mut usize); - -pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer { +pub unsafe fn init(stack_base: *mut u8, f: unsafe fn(usize, StackPointer)) -> StackPointer { #[cfg(not(target_vendor = "apple"))] #[naked] unsafe extern "C" fn trampoline_1() { @@ -129,15 +127,41 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) - # trampoline_2. nop - # Call the provided function. - call *16(%rsp) - "# - : : : : "volatile") - } + # Call unwind_wrapper with the provided function and the stack base address. + leaq 32(%rsp), %rdx + movq 16(%rsp), %rcx + call ${0:c} + + # Restore the stack pointer of the parent context. No CFI adjustments + # are needed since we have the same stack frame as trampoline_1. + movq (%rsp), %rsp + + # Restore frame pointer of the parent context. + popq %rbp + .cfi_adjust_cfa_offset -8 + .cfi_restore %rbp + + # If the returned value is nonzero, trigger an unwind in the parent + # context with the given exception object. + movq %rax, %rdi + testq %rax, %rax + jnz ${1:c} - unsafe fn push(sp: &mut StackPointer, val: usize) { - sp.0 = sp.0.offset(-1); - *sp.0 = val + # Clear the stack pointer. We can't call into this context any more once + # the function has returned. + xorq %rsi, %rsi + + # Return into the parent context. Use `pop` and `jmp` instead of a `ret` + # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge). 
+ popq %rax + .cfi_adjust_cfa_offset -8 + .cfi_register %rip, %rax + jmpq *%rax + "# + : + : "s" (unwind::unwind_wrapper as usize) + "s" (unwind::start_unwind as usize) + : : "volatile") } // We set up the stack in a somewhat special way so that to the unwinder it @@ -148,39 +172,30 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) - // followed by the %rbp value for that frame. This setup supports unwinding // using DWARF CFI as well as the frame pointer-based unwinding used by tools // such as perf or dtrace. - let mut sp = StackPointer(stack.base() as *mut usize); + let mut sp = StackPointer::new(stack_base); - push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned - push(&mut sp, f as usize); // Function that trampoline_2 should call + sp.push(0 as usize); // Padding to ensure the stack is properly aligned + sp.push(f as usize); // Function that trampoline_2 should call // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline // each time a context switch is performed. - push(&mut sp, trampoline_1 as usize + 2); // Return after the 2 nops - push(&mut sp, 0xdeaddeaddead0cfa); // CFA slot + sp.push(trampoline_1 as usize + 2); // Return after the 2 nops + sp.push(0xdeaddeaddead0cfa); // CFA slot // Call frame for swap::trampoline. We set up the %rbp value to point to the // parent call frame. - let frame = sp; - push(&mut sp, trampoline_2 as usize + 1); // Entry point, skip initial nop - push(&mut sp, frame.0 as usize); // Pointer to parent call frame + let frame = sp.offset(0); + sp.push(trampoline_2 as usize + 1); // Entry point, skip initial nop + sp.push(frame as usize); // Pointer to parent call frame sp } #[inline(always)] -pub unsafe fn swap(arg: usize, new_sp: StackPointer, - new_stack: Option<&Stack>) -> (usize, StackPointer) { - // Address of the topmost CFA stack slot. - let mut dummy: usize = mem::uninitialized(); - let new_cfa = if let Some(new_stack) = new_stack { - (new_stack.base() as *mut usize).offset(-4) - } else { - // Just pass a dummy pointer if we aren't linking the stack - &mut dummy - }; - +pub unsafe fn swap_link(arg: usize, new_sp: StackPointer, + new_stack_base: *mut u8) -> (usize, Option) { let ret: usize; - let ret_sp: *mut usize; + let ret_sp: usize; asm!( r#" # Push the return address @@ -194,7 +209,7 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, # Link the call stacks together by writing the current stack bottom # address to the CFA slot in the new stack. - movq %rsp, (%rcx) + movq %rsp, -32(%rcx) # Pass the stack pointer of the old context to the new one. movq %rsp, %rsi @@ -215,8 +230,8 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, : "={rdi}" (ret) "={rsi}" (ret_sp) : "{rdi}" (arg) - "{rdx}" (new_sp.0) - "{rcx}" (new_cfa) + "{rdx}" (new_sp.offset(0)) + "{rcx}" (new_stack_base) : "rax", "rbx", "rcx", "rdx", /*"rsi", "rdi", "rbp", "rsp",*/ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", @@ -231,5 +246,76 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer, // the "alignstack" LLVM inline assembly option does exactly the same // thing on x86_64. : "volatile", "alignstack"); - (ret, StackPointer(ret_sp)) + (ret, mem::transmute(ret_sp)) +} + +#[inline(always)] +pub unsafe fn swap(arg: usize, new_sp: StackPointer) -> (usize, StackPointer) { + // This is identical to swap_link, but without the write to the CFA slot. 
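+  // swap is used for switches that don't need to re-link the stacks, such as
+  // a generator yielding back to its parent: Yielder::suspend in
+  // src/generator.rs calls swap, while Generator::resume calls swap_link.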
+ let ret: usize; + let ret_sp: usize; + asm!( + r#" + leaq 0f(%rip), %rax + pushq %rax + pushq %rbp + movq %rsp, %rsi + movq %rdx, %rsp + popq %rbp + popq %rax + jmpq *%rax + 0: + "# + : "={rdi}" (ret) + "={rsi}" (ret_sp) + : "{rdi}" (arg) + "{rdx}" (new_sp.offset(0)) + : "rax", "rbx", "rcx", "rdx", /*"rsi", "rdi", "rbp", "rsp",*/ + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23", + "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31", + "cc", "dirflag", "fpsr", "flags", "memory" + : "volatile", "alignstack"); + (ret, mem::transmute(ret_sp)) +} + +#[inline(always)] +pub unsafe fn unwind(new_sp: StackPointer, new_stack_base: *mut u8) { + // Argument to pass to start_unwind, based on the stack base address. + let arg = unwind::unwind_arg(new_stack_base); + + // This is identical to swap_link, except that it performs a tail call to + // start_unwind instead of returning into the target context. + asm!( + r#" + leaq 0f(%rip), %rax + pushq %rax + pushq %rbp + movq %rsp, -32(%rcx) + movq %rdx, %rsp + popq %rbp + + # Jump to the start_unwind function, which will force a stack unwind in + # the target context. This will eventually return to us through the + # stack link. + jmp ${0:c} + 0: + "# + : + : "s" (unwind::start_unwind as usize) + "{rdi}" (arg) + "{rdx}" (new_sp.offset(0)) + "{rcx}" (new_stack_base) + : "rax", "rbx", "rcx", "rdx", "rsi", "rdi", /*"rbp", "rsp",*/ + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23", + "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31", + "cc", "dirflag", "fpsr", "flags", "memory" + : "volatile", "alignstack"); } diff --git a/src/generator.rs b/src/generator.rs index e8adb8f6..207de5ad 100644 --- a/src/generator.rs +++ b/src/generator.rs @@ -19,6 +19,36 @@ use stack; use debug; use arch::{self, StackPointer}; +// Wrapper to prevent the compiler from automatically dropping a value when it +// goes out of scope. This is particularly useful when dealing with unwinding +// since mem::forget won't be executed when unwinding. +#[allow(unions_with_drop_fields)] +union NoDrop { + inner: T, +} + +// Try to pack a value into a usize if it fits, otherwise pass its address as a usize. +unsafe fn encode_usize(val: &NoDrop) -> usize { + if mem::size_of::() <= mem::size_of::() && + mem::align_of::() <= mem::align_of::() { + let mut out = 0; + ptr::copy_nonoverlapping(&val.inner, &mut out as *mut usize as *mut T, 1); + out + } else { + &val.inner as *const T as usize + } +} + +// Unpack a usize produced by encode_usize. +unsafe fn decode_usize(val: usize) -> T { + if mem::size_of::() <= mem::size_of::() && + mem::align_of::() <= mem::align_of::() { + ptr::read(&val as *const usize as *const T) + } else { + ptr::read(val as *const T) + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum State { /// Generator can be resumed. This is the initial state. 
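A note on the encoding scheme above: encode_usize packs the value directly into
the usize that swap transfers when its size and alignment fit in a usize, and
otherwise passes the value's address; decode_usize must be instantiated at the
same type to reverse it. A minimal sketch of the round trip (assuming the
NoDrop wrapper stays alive until the value is decoded, which is what the swap
protocol guarantees):

    unsafe {
        let small = NoDrop { inner: 42u8 };       // fits in a usize: packed inline
        assert_eq!(decode_usize::<u8>(encode_usize(&small)), 42);

        let big = NoDrop { inner: [1u64, 2, 3] }; // too large: passed by address
        assert_eq!(decode_usize::<[u64; 3]>(encode_usize(&big)), [1, 2, 3]);
    }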
@@ -83,18 +113,12 @@ pub enum State { /// ``` #[derive(Debug)] pub struct Generator<'a, Input: 'a, Output: 'a, Stack: stack::Stack> { - state: State, stack: NoDrop, - stack_id: NoDrop, - stack_ptr: arch::StackPointer, + stack_id: debug::StackId, + stack_ptr: Option, phantom: PhantomData<(&'a (), *mut Input, *const Output)> } -#[allow(unions_with_drop_fields)] -union NoDrop { - inner: T -} - impl ::core::fmt::Debug for NoDrop { fn fmt(&self, w: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { unsafe { @@ -124,31 +148,28 @@ impl<'a, Input, Output, Stack> Generator<'a, Input, Output, Stack> /// See also the [contract](../trait.Stack.html) that needs to be fulfilled by `stack`. pub unsafe fn unsafe_new(stack: Stack, f: F) -> Generator<'a, Input, Output, Stack> where F: FnOnce(&Yielder, Input) + 'a { - unsafe extern "C" fn generator_wrapper(env: usize, stack_ptr: StackPointer) -> ! + unsafe fn generator_wrapper(env: usize, stack_ptr: StackPointer) where Stack: stack::Stack, F: FnOnce(&Yielder, Input) { // Retrieve our environment from the callee and return control to it. - let f = ptr::read(env as *const F); - let (data, stack_ptr) = arch::swap(0, stack_ptr, None); + let f: F = decode_usize(env); + let (data, stack_ptr) = arch::swap(0, stack_ptr); // See the second half of Yielder::suspend_bare. - let input = ptr::read(data as *const Input); + let input = decode_usize(data); // Run the body of the generator. let yielder = Yielder::new(stack_ptr); f(&yielder, input); - // Past this point, the generator has dropped everything it has held. - loop { yielder.suspend_bare(None); } } let stack_id = debug::StackId::register(&stack); - let stack_ptr = arch::init(&stack, generator_wrapper::); + let stack_ptr = arch::init(stack.base(), generator_wrapper::); // Transfer environment to the callee. - let stack_ptr = arch::swap(&f as *const F as usize, stack_ptr, Some(&stack)).1; - mem::forget(f); + let f = NoDrop { inner: f }; + let stack_ptr = arch::swap_link(encode_usize(&f), stack_ptr, stack.base()).1; Generator { - state: State::Runnable, stack: NoDrop { inner: stack }, - stack_id: NoDrop { inner: stack_id }, + stack_id: stack_id, stack_ptr: stack_ptr, phantom: PhantomData } @@ -158,44 +179,38 @@ impl<'a, Input, Output, Stack> Generator<'a, Input, Output, Stack> /// If the generator function has returned, returns `None`. #[inline] pub fn resume(&mut self, input: Input) -> Option { - match self.state { - State::Runnable => { - // Set the state to Unavailable. Since we have exclusive access to the generator, - // the only case where this matters is the generator function panics, after which - // it must not be invocable again. - self.state = State::Unavailable; + // Return None if we have no stack pointer (generator function already returned). + self.stack_ptr.and_then(|stack_ptr| { + // Set the state to Unavailable. Since we have exclusive access to the generator, + // the only case where this matters is the generator function panics, after which + // it must not be invocable again. + self.stack_ptr = None; - // Switch to the generator function, and retrieve the yielded value. - let val = unsafe { - let (data_out, stack_ptr) = arch::swap(&input as *const Input as usize, self.stack_ptr, Some(&self.stack.inner)); - self.stack_ptr = stack_ptr; - mem::forget(input); - ptr::read(data_out as *const Option) - }; + // Switch to the generator function, and retrieve the yielded value. 
+ unsafe { + let input = NoDrop { inner: input }; + let (data_out, stack_ptr) = arch::swap_link(encode_usize(&input), stack_ptr, self.stack.inner.base()); + self.stack_ptr = stack_ptr; - // Unless the generator function has returned, it can be switched to again, so - // set the state to Runnable. - if val.is_some() { self.state = State::Runnable } - - val + // If the generator function has finished, return None, otherwise return the + // yielded value. + stack_ptr.map(|_| decode_usize(data_out)) } - State::Unavailable => None - } + }) } /// Returns the state of the generator. #[inline] - pub fn state(&self) -> State { self.state } + pub fn state(&self) -> State { + if self.stack_ptr.is_some() { State::Runnable } else { State::Unavailable } + } /// Extracts the stack from a generator when the generator function has returned. /// If the generator function has not returned /// (i.e. `self.state() == State::Runnable`), panics. pub fn unwrap(self) -> Stack { - match self.state { - State::Runnable => { - mem::forget(self); - panic!("Argh! Bastard! Don't touch that!") - } + match self.state() { + State::Runnable => panic!("Argh! Bastard! Don't touch that!"), State::Unavailable => unsafe { self.unsafe_unwrap() } } } @@ -203,8 +218,13 @@ impl<'a, Input, Output, Stack> Generator<'a, Input, Output, Stack> /// Extracts the stack from a generator without checking if the generator function has returned. /// This will leave any pointers into the generator stack dangling, and won't run destructors. pub unsafe fn unsafe_unwrap(mut self) -> Stack { - ptr::drop_in_place(&mut self.stack_id.inner); - let stack = ptr::read(&mut self.stack.inner); + if cfg!(feature = "unwind") { + self.stack_ptr.map(|stack_ptr| arch::unwind(stack_ptr, self.stack.inner.base())); + } + + // We can't just return self.stack since Generator has a Drop impl + let stack = ptr::read(&self.stack.inner); + ptr::drop_in_place(&mut self.stack_id); mem::forget(self); stack } @@ -214,10 +234,15 @@ impl<'a, Input, Output, Stack> Drop for Generator<'a, Input, Output, Stack> where Input: 'a, Output: 'a, Stack: stack::Stack { fn drop(&mut self) { unsafe { - ptr::drop_in_place(&mut self.stack_id.inner); - match self.state { - State::Runnable => panic!("dropped unfinished Generator"), - State::Unavailable => ptr::drop_in_place(&mut self.stack.inner) + match self.stack_ptr { + Some(stack_ptr) => { + // If unwinding is not available then we have to leak the stack. + if cfg!(feature = "unwind") { + arch::unwind(stack_ptr, self.stack.inner.base()); + ptr::drop_in_place(&mut self.stack.inner); + } + } + None => ptr::drop_in_place(&mut self.stack.inner) } } } @@ -227,33 +252,43 @@ impl<'a, Input, Output, Stack> Drop for Generator<'a, Input, Output, Stack> /// returns a value. #[derive(Debug)] pub struct Yielder { - stack_ptr: Cell, + stack_ptr: Cell>, phantom: PhantomData<(*const Input, *mut Output)> } impl Yielder { fn new(stack_ptr: StackPointer) -> Yielder { Yielder { - stack_ptr: Cell::new(stack_ptr), + stack_ptr: Cell::new(Some(stack_ptr)), phantom: PhantomData } } - #[inline(always)] - fn suspend_bare(&self, val: Option) -> Input { - unsafe { - let (data, stack_ptr) = arch::swap(&val as *const Option as usize, self.stack_ptr.get(), None); - self.stack_ptr.set(stack_ptr); - mem::forget(val); - ptr::read(data as *const Input) - } - } - /// Suspends the generator and returns `Some(item)` from the `resume()` /// invocation that resumed the generator. 
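  ///
  /// Panics with "attempted to yield while unwinding" if called while the
  /// generator is being unwound, since there is no context to yield to.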
   #[inline(always)]
   pub fn suspend(&self, item: Output) -> Input {
-    self.suspend_bare(Some(item))
+    unsafe {
+      struct PanicGuard<'a>(&'a Cell<Option<StackPointer>>);
+      impl<'a> Drop for PanicGuard<'a> {
+        fn drop(&mut self) {
+          self.0.set(None);
+        }
+      }
+
+      let stack_ptr = self.stack_ptr.get().expect("attempted to yield while unwinding");
+      let item = NoDrop { inner: item };
+
+      // Use a PanicGuard to set self.stack_ptr to None if unwinding occurs. This
+      // is necessary to guarantee safety in case someone tries to yield while we
+      // are unwinding, since there is nowhere to yield to.
+      let guard = PanicGuard(&self.stack_ptr);
+      let (data, stack_ptr) = arch::swap(encode_usize(&item), stack_ptr);
+      mem::forget(guard);
+
+      self.stack_ptr.set(Some(stack_ptr));
+      decode_usize(data)
+    }
   }
 }
diff --git a/src/lib.rs b/src/lib.rs
index af4cf14b..c24dac00 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,7 +4,8 @@
 // http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
 // http://opensource.org/licenses/MIT>, at your option. This file may not be
 // copied, modified, or distributed except according to those terms.
-#![feature(asm, naked_functions, cfg_target_vendor, untagged_unions)]
+
+#![feature(asm, naked_functions, cfg_target_vendor, untagged_unions, unwind_attributes)]
 #![cfg_attr(feature = "alloc", feature(alloc, heap_api, allocator_api))]
 #![cfg_attr(test, feature(test))]
 #![no_std]
@@ -43,6 +44,13 @@ pub const STACK_ALIGNMENT: usize = arch::STACK_ALIGNMENT;
 
 mod debug;
 
+#[cfg(feature = "unwind")]
+#[path = "unwind.rs"]
+mod unwind;
+#[cfg(not(feature = "unwind"))]
+#[path = "no_unwind.rs"]
+mod unwind;
+
 pub mod generator;
 mod stack;
diff --git a/src/no_unwind.rs b/src/no_unwind.rs
new file mode 100644
index 00000000..24fac7f0
--- /dev/null
+++ b/src/no_unwind.rs
@@ -0,0 +1,23 @@
+// This file is part of libfringe, a low-level green threading library.
+// Copyright (c) Amanieu d'Antras,
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+use arch::StackPointer;
+
+pub unsafe extern "C" fn unwind_wrapper(arg: usize, sp: StackPointer, _stack_base: *mut u8,
+                                        f: unsafe fn(usize, StackPointer)) -> usize {
+  f(arg, sp);
+  0
+}
+
+pub unsafe extern "C" fn start_unwind(_panic: usize) -> ! {
+  unreachable!();
+}
+
+#[inline]
+pub fn unwind_arg(_stack_base: *mut u8) -> usize {
+  unreachable!();
+}
diff --git a/src/stack/os/mod.rs b/src/stack/os/mod.rs
index 7c3fdd61..af6e44bb 100644
--- a/src/stack/os/mod.rs
+++ b/src/stack/os/mod.rs
@@ -29,8 +29,8 @@ impl OsStack {
   pub fn new(size: usize) -> Result<OsStack, IoError> {
     let page_size = sys::page_size();
 
-    // Stacks have to be at least one page long.
-    let len = if size == 0 { page_size } else { size };
+    // Stacks have to be at least 16 KiB to support unwinding.
+    let len = if size == 0 { 16384 } else { size };
 
     // Round the length one page size up, using the fact that the page size
     // is a power of two.
diff --git a/src/unwind.rs b/src/unwind.rs
new file mode 100644
index 00000000..0d018f3d
--- /dev/null
+++ b/src/unwind.rs
@@ -0,0 +1,81 @@
+// This file is part of libfringe, a low-level green threading library.
+// Copyright (c) Amanieu d'Antras,
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
+// http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
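+
+// How the unwind protocol works: to unwind a generator stack, a panic carrying
+// an `UnwindMarker` is started on that stack. `unwind_wrapper` catches anything
+// that unwinds out of the generator's root function: it swallows a marker whose
+// `stack_base` matches its own, ending the unwind there, and re-raises any other
+// payload into the parent context (directly where cross-stack unwinding works,
+// otherwise by returning the boxed exception to the assembly trampoline, which
+// hands it to `start_unwind` on the parent stack).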
+extern crate std;
+
+use self::std::panic;
+use self::std::boxed::Box;
+use core::any::Any;
+use arch::StackPointer;
+
+// Marker object that is passed through the stack during unwinding.
+struct UnwindMarker {
+  // We use the stack base as an identifier so that nested generators are handled
+  // correctly. When unwinding, we will want to continue through any number of
+  // nested generators until we reach the one with a matching identifier.
+  stack_base: *mut u8,
+}
+unsafe impl Send for UnwindMarker {}
+
+// Whether the current platform supports unwinding across multiple stacks.
+#[inline]
+fn have_cross_stack_unwind() -> bool {
+  // - Windows uses SEH for unwinding instead of libunwind. While it may be
+  //   possible to munge it to support cross-stack unwinding, we stay
+  //   conservative for now.
+  // - iOS on ARM uses setjmp/longjmp instead of DWARF-2 unwinding, and that
+  //   state would need to be explicitly saved/restored when switching contexts.
+  // - LLVM doesn't currently support ARM EHABI directives in inline assembly, so
+  //   we instead need to propagate exceptions manually across contexts.
+  !(cfg!(windows) || cfg!(target_arch = "arm"))
+}
+
+// Wrapper around the root function of a generator which handles unwinding.
+#[unwind(allowed)]
+pub unsafe extern "C" fn unwind_wrapper(arg: usize, sp: StackPointer, stack_base: *mut u8,
                                        f: unsafe fn(usize, StackPointer)) -> Option<Box<Box<Any + Send>>> {
+  // Catch any attempts to unwind out of the context.
+  match panic::catch_unwind(move || f(arg, sp)) {
+    Ok(_) => None,
+    Err(err) => {
+      // If the unwinding is due to an UnwindMarker, check whether it is intended
+      // for us by comparing its stack base with ours. If it is the same then we
+      // can swallow the exception and return to the caller normally.
+      if let Some(marker) = err.downcast_ref::<UnwindMarker>() {
+        if marker.stack_base == stack_base {
+          return None;
+        }
+      }
+
+      // Otherwise, propagate the panic to the parent context.
+      if have_cross_stack_unwind() {
+        panic::resume_unwind(err)
+      } else {
+        // The assembly code will call start_unwind in the parent context and
+        // pass it this Box as parameter.
+        Some(Box::new(err))
+      }
+    }
+  }
+}
+
+// Called by asm to start unwinding in the current context with the given
+// exception object.
+#[unwind(allowed)]
+pub unsafe extern "C" fn start_unwind(panic: Box<Box<Any + Send>>) -> ! {
+  // Use resume_unwind instead of panic! to avoid printing a message.
+  panic::resume_unwind(*panic)
+}
+
+// Get the initial argument to pass to start_unwind, keyed to the base address
+// of the generator stack that is going to be unwound.
+#[inline]
+pub fn unwind_arg(stack_base: *mut u8) -> usize {
+  let marker = UnwindMarker {
+    stack_base: stack_base
+  };
+  Box::into_raw(Box::new(Box::new(marker) as Box<Any + Send>)) as usize
+}
diff --git a/tests/generator.rs b/tests/generator.rs
index d230153a..d243c780 100644
--- a/tests/generator.rs
+++ b/tests/generator.rs
@@ -68,7 +68,7 @@ fn panic_safety() {
 
 #[test]
 fn with_slice_stack() {
-  let mut memory = [0; 1024];
+  let mut memory = [0; 16384];
   let stack = SliceStack::new(&mut memory);
   let mut add_one = unsafe { Generator::unsafe_new(stack, add_one_fn) };
   assert_eq!(add_one.resume(1), Some(2));
@@ -78,7 +78,7 @@ fn with_slice_stack() {
 
 #[test]
 fn with_owned_stack() {
-  let stack = OwnedStack::new(1024);
+  let stack = OwnedStack::new(16384);
   let mut add_one = unsafe { Generator::unsafe_new(stack, add_one_fn) };
   assert_eq!(add_one.resume(1), Some(2));
   assert_eq!(add_one.resume(2), Some(3));
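diff --git a/tests/unwind.rs b/tests/unwind.rs
new file mode 100644
index 00000000..00000000
--- /dev/null
+++ b/tests/unwind.rs
@@ -0,0 +1,36 @@
+// Illustrative sketch, not part of the original patch: the file name, the
+// `Dropper` helper, and the `fringe::{Generator, OsStack}` imports are
+// assumptions. It shows the intended effect of the new `unwind` feature:
+// dropping a generator that has not finished unwinds its stack, running
+// destructors instead of leaking the stack or panicking.
+extern crate fringe;
+
+use std::cell::Cell;
+use fringe::{Generator, OsStack};
+
+#[test]
+fn drop_runs_destructors_on_generator_stack() {
+  struct Dropper<'a>(&'a Cell<bool>);
+  impl<'a> Drop for Dropper<'a> {
+    fn drop(&mut self) {
+      self.0.set(true);
+    }
+  }
+
+  let dropped = Cell::new(false);
+  {
+    let stack = OsStack::new(16384).unwrap();
+    let mut gen = unsafe {
+      Generator::unsafe_new(stack, |yielder, ()| {
+        // `_guard` lives on the generator stack across the suspension point.
+        let _guard = Dropper(&dropped);
+        yielder.suspend(1);
+        yielder.suspend(2);
+      })
+    };
+    assert_eq!(gen.resume(()), Some(1));
+    // `gen` is dropped here while still runnable; with the `unwind` feature
+    // its stack is unwound, so `_guard`'s destructor runs.
+  }
+  assert_eq!(dropped.get(), true);
+}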